--- _id: '52958' author: - first_name: Christoph full_name: Boeddeker, Christoph id: '40767' last_name: Boeddeker - first_name: Aswin Shanmugam full_name: Subramanian, Aswin Shanmugam last_name: Subramanian - first_name: Gordon full_name: Wichern, Gordon last_name: Wichern - first_name: Reinhold full_name: Haeb-Umbach, Reinhold id: '242' last_name: Haeb-Umbach - first_name: Jonathan full_name: Le Roux, Jonathan last_name: Le Roux citation: ama: 'Boeddeker C, Subramanian AS, Wichern G, Haeb-Umbach R, Le Roux J. TS-SEP: Joint Diarization and Separation Conditioned on Estimated Speaker Embeddings. IEEE/ACM Transactions on Audio, Speech, and Language Processing. 2024;32:1185-1197. doi:10.1109/taslp.2024.3350887' apa: 'Boeddeker, C., Subramanian, A. S., Wichern, G., Haeb-Umbach, R., & Le Roux, J. (2024). TS-SEP: Joint Diarization and Separation Conditioned on Estimated Speaker Embeddings. IEEE/ACM Transactions on Audio, Speech, and Language Processing, 32, 1185–1197. https://doi.org/10.1109/taslp.2024.3350887' bibtex: '@article{Boeddeker_Subramanian_Wichern_Haeb-Umbach_Le Roux_2024, title={TS-SEP: Joint Diarization and Separation Conditioned on Estimated Speaker Embeddings}, volume={32}, DOI={10.1109/taslp.2024.3350887}, journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Boeddeker, Christoph and Subramanian, Aswin Shanmugam and Wichern, Gordon and Haeb-Umbach, Reinhold and Le Roux, Jonathan}, year={2024}, pages={1185–1197} }' chicago: 'Boeddeker, Christoph, Aswin Shanmugam Subramanian, Gordon Wichern, Reinhold Haeb-Umbach, and Jonathan Le Roux. “TS-SEP: Joint Diarization and Separation Conditioned on Estimated Speaker Embeddings.” IEEE/ACM Transactions on Audio, Speech, and Language Processing 32 (2024): 1185–97. https://doi.org/10.1109/taslp.2024.3350887.' ieee: 'C. Boeddeker, A. S. Subramanian, G. Wichern, R. Haeb-Umbach, and J. 
Le Roux, “TS-SEP: Joint Diarization and Separation Conditioned on Estimated Speaker Embeddings,” IEEE/ACM Transactions on Audio, Speech, and Language Processing, vol. 32, pp. 1185–1197, 2024, doi: 10.1109/taslp.2024.3350887.' mla: 'Boeddeker, Christoph, et al. “TS-SEP: Joint Diarization and Separation Conditioned on Estimated Speaker Embeddings.” IEEE/ACM Transactions on Audio, Speech, and Language Processing, vol. 32, Institute of Electrical and Electronics Engineers (IEEE), 2024, pp. 1185–97, doi:10.1109/taslp.2024.3350887.' short: C. Boeddeker, A.S. Subramanian, G. Wichern, R. Haeb-Umbach, J. Le Roux, IEEE/ACM Transactions on Audio, Speech, and Language Processing 32 (2024) 1185–1197. date_created: 2024-03-26T16:11:54Z date_updated: 2024-03-26T16:16:34Z department: - _id: '54' doi: 10.1109/taslp.2024.3350887 intvolume: ' 32' keyword: - Electrical and Electronic Engineering - Acoustics and Ultrasonics - Computer Science (miscellaneous) - Computational Mathematics language: - iso: eng main_file_link: - open_access: '1' url: https://arxiv.org/abs/2303.03849 oa: '1' page: 1185-1197 publication: IEEE/ACM Transactions on Audio, Speech, and Language Processing publication_identifier: issn: - 2329-9290 - 2329-9304 publication_status: published publisher: Institute of Electrical and Electronics Engineers (IEEE) status: public title: 'TS-SEP: Joint Diarization and Separation Conditioned on Estimated Speaker Embeddings' type: journal_article user_id: '40767' volume: 32 year: '2024' ... 
--- _id: '47128' author: - first_name: Tobias full_name: Cord-Landwehr, Tobias id: '44393' last_name: Cord-Landwehr - first_name: Christoph full_name: Boeddeker, Christoph id: '40767' last_name: Boeddeker - first_name: Cătălin full_name: Zorilă, Cătălin last_name: Zorilă - first_name: Rama full_name: Doddipatla, Rama last_name: Doddipatla - first_name: Reinhold full_name: Haeb-Umbach, Reinhold id: '242' last_name: Haeb-Umbach citation: ama: 'Cord-Landwehr T, Boeddeker C, Zorilă C, Doddipatla R, Haeb-Umbach R. Frame-Wise and Overlap-Robust Speaker Embeddings for Meeting Diarization. In: ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE; 2023. doi:10.1109/icassp49357.2023.10095370' apa: Cord-Landwehr, T., Boeddeker, C., Zorilă, C., Doddipatla, R., & Haeb-Umbach, R. (2023). Frame-Wise and Overlap-Robust Speaker Embeddings for Meeting Diarization. ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 2023 IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), Rhodes. https://doi.org/10.1109/icassp49357.2023.10095370 bibtex: '@inproceedings{Cord-Landwehr_Boeddeker_Zorilă_Doddipatla_Haeb-Umbach_2023, title={Frame-Wise and Overlap-Robust Speaker Embeddings for Meeting Diarization}, DOI={10.1109/icassp49357.2023.10095370}, booktitle={ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, publisher={IEEE}, author={Cord-Landwehr, Tobias and Boeddeker, Christoph and Zorilă, Cătălin and Doddipatla, Rama and Haeb-Umbach, Reinhold}, year={2023} }' chicago: Cord-Landwehr, Tobias, Christoph Boeddeker, Cătălin Zorilă, Rama Doddipatla, and Reinhold Haeb-Umbach. “Frame-Wise and Overlap-Robust Speaker Embeddings for Meeting Diarization.” In ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 2023. https://doi.org/10.1109/icassp49357.2023.10095370. 
ieee: 'T. Cord-Landwehr, C. Boeddeker, C. Zorilă, R. Doddipatla, and R. Haeb-Umbach, “Frame-Wise and Overlap-Robust Speaker Embeddings for Meeting Diarization,” presented at the 2023 IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), Rhodes, 2023, doi: 10.1109/icassp49357.2023.10095370.' mla: Cord-Landwehr, Tobias, et al. “Frame-Wise and Overlap-Robust Speaker Embeddings for Meeting Diarization.” ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), IEEE, 2023, doi:10.1109/icassp49357.2023.10095370. short: 'T. Cord-Landwehr, C. Boeddeker, C. Zorilă, R. Doddipatla, R. Haeb-Umbach, in: ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), IEEE, 2023.' conference: location: Rhodes name: 2023 IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP) date_created: 2023-09-19T14:01:20Z date_updated: 2023-11-15T14:56:27Z ddc: - '000' department: - _id: '54' doi: 10.1109/icassp49357.2023.10095370 file: - access_level: open_access content_type: application/pdf creator: cord date_created: 2023-11-15T14:56:18Z date_updated: 2023-11-15T14:56:18Z file_id: '48932' file_name: teacher_student_embeddings.pdf file_size: 246306 relation: main_file file_date_updated: 2023-11-15T14:56:18Z has_accepted_license: '1' language: - iso: eng oa: '1' project: - _id: '52' name: 'PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing' publication: ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) publication_status: published publisher: IEEE status: public title: Frame-Wise and Overlap-Robust Speaker Embeddings for Meeting Diarization type: conference user_id: '44393' year: '2023' ... 
--- _id: '47129' author: - first_name: Tobias full_name: Cord-Landwehr, Tobias id: '44393' last_name: Cord-Landwehr - first_name: Christoph full_name: Boeddeker, Christoph id: '40767' last_name: Boeddeker - first_name: Cătălin full_name: Zorilă, Cătălin last_name: Zorilă - first_name: Rama full_name: Doddipatla, Rama last_name: Doddipatla - first_name: Reinhold full_name: Haeb-Umbach, Reinhold id: '242' last_name: Haeb-Umbach citation: ama: 'Cord-Landwehr T, Boeddeker C, Zorilă C, Doddipatla R, Haeb-Umbach R. A Teacher-Student Approach for Extracting Informative Speaker Embeddings From Speech Mixtures. In: INTERSPEECH 2023. ISCA; 2023. doi:10.21437/interspeech.2023-1379' apa: Cord-Landwehr, T., Boeddeker, C., Zorilă, C., Doddipatla, R., & Haeb-Umbach, R. (2023). A Teacher-Student Approach for Extracting Informative Speaker Embeddings From Speech Mixtures. INTERSPEECH 2023. https://doi.org/10.21437/interspeech.2023-1379 bibtex: '@inproceedings{Cord-Landwehr_Boeddeker_Zorilă_Doddipatla_Haeb-Umbach_2023, title={A Teacher-Student Approach for Extracting Informative Speaker Embeddings From Speech Mixtures}, DOI={10.21437/interspeech.2023-1379}, booktitle={INTERSPEECH 2023}, publisher={ISCA}, author={Cord-Landwehr, Tobias and Boeddeker, Christoph and Zorilă, Cătălin and Doddipatla, Rama and Haeb-Umbach, Reinhold}, year={2023} }' chicago: Cord-Landwehr, Tobias, Christoph Boeddeker, Cătălin Zorilă, Rama Doddipatla, and Reinhold Haeb-Umbach. “A Teacher-Student Approach for Extracting Informative Speaker Embeddings From Speech Mixtures.” In INTERSPEECH 2023. ISCA, 2023. https://doi.org/10.21437/interspeech.2023-1379. ieee: 'T. Cord-Landwehr, C. Boeddeker, C. Zorilă, R. Doddipatla, and R. Haeb-Umbach, “A Teacher-Student Approach for Extracting Informative Speaker Embeddings From Speech Mixtures,” 2023, doi: 10.21437/interspeech.2023-1379.' mla: Cord-Landwehr, Tobias, et al. 
“A Teacher-Student Approach for Extracting Informative Speaker Embeddings From Speech Mixtures.” INTERSPEECH 2023, ISCA, 2023, doi:10.21437/interspeech.2023-1379. short: 'T. Cord-Landwehr, C. Boeddeker, C. Zorilă, R. Doddipatla, R. Haeb-Umbach, in: INTERSPEECH 2023, ISCA, 2023.' date_created: 2023-09-19T14:34:37Z date_updated: 2023-11-15T15:00:12Z ddc: - '000' department: - _id: '54' doi: 10.21437/interspeech.2023-1379 file: - access_level: open_access content_type: application/pdf creator: cord date_created: 2023-11-15T15:00:02Z date_updated: 2023-11-15T15:00:02Z file_id: '48933' file_name: multispeaker_embeddings.pdf file_size: 303203 relation: main_file file_date_updated: 2023-11-15T15:00:02Z has_accepted_license: '1' language: - iso: eng oa: '1' project: - _id: '52' name: 'PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing' publication: INTERSPEECH 2023 publication_status: published publisher: ISCA status: public title: A Teacher-Student Approach for Extracting Informative Speaker Embeddings From Speech Mixtures type: conference user_id: '44393' year: '2023' ... --- _id: '48391' author: - first_name: Rohith full_name: Aralikatti, Rohith last_name: Aralikatti - first_name: Christoph full_name: Boeddeker, Christoph id: '40767' last_name: Boeddeker - first_name: Gordon full_name: Wichern, Gordon last_name: Wichern - first_name: Aswin full_name: Subramanian, Aswin last_name: Subramanian - first_name: Jonathan full_name: Le Roux, Jonathan last_name: Le Roux citation: ama: 'Aralikatti R, Boeddeker C, Wichern G, Subramanian A, Le Roux J. Reverberation as Supervision For Speech Separation. In: ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE; 2023. doi:10.1109/icassp49357.2023.10095022' apa: Aralikatti, R., Boeddeker, C., Wichern, G., Subramanian, A., & Le Roux, J. (2023). Reverberation as Supervision For Speech Separation. 
ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). https://doi.org/10.1109/icassp49357.2023.10095022 bibtex: '@inproceedings{Aralikatti_Boeddeker_Wichern_Subramanian_Le Roux_2023, title={Reverberation as Supervision For Speech Separation}, DOI={10.1109/icassp49357.2023.10095022}, booktitle={ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, publisher={IEEE}, author={Aralikatti, Rohith and Boeddeker, Christoph and Wichern, Gordon and Subramanian, Aswin and Le Roux, Jonathan}, year={2023} }' chicago: Aralikatti, Rohith, Christoph Boeddeker, Gordon Wichern, Aswin Subramanian, and Jonathan Le Roux. “Reverberation as Supervision For Speech Separation.” In ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 2023. https://doi.org/10.1109/icassp49357.2023.10095022. ieee: 'R. Aralikatti, C. Boeddeker, G. Wichern, A. Subramanian, and J. Le Roux, “Reverberation as Supervision For Speech Separation,” 2023, doi: 10.1109/icassp49357.2023.10095022.' mla: Aralikatti, Rohith, et al. “Reverberation as Supervision For Speech Separation.” ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), IEEE, 2023, doi:10.1109/icassp49357.2023.10095022. short: 'R. Aralikatti, C. Boeddeker, G. Wichern, A. Subramanian, J. Le Roux, in: ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), IEEE, 2023.' date_created: 2023-10-23T15:09:13Z date_updated: 2023-10-23T15:10:16Z department: - _id: '54' doi: 10.1109/icassp49357.2023.10095022 language: - iso: eng publication: ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) publication_status: published publisher: IEEE status: public title: Reverberation as Supervision For Speech Separation type: conference user_id: '40767' year: '2023' ... 
--- _id: '48390' author: - first_name: Simon full_name: Berger, Simon last_name: Berger - first_name: Peter full_name: Vieting, Peter last_name: Vieting - first_name: Christoph full_name: Boeddeker, Christoph id: '40767' last_name: Boeddeker - first_name: Ralf full_name: Schlüter, Ralf last_name: Schlüter - first_name: Reinhold full_name: Haeb-Umbach, Reinhold id: '242' last_name: Haeb-Umbach citation: ama: 'Berger S, Vieting P, Boeddeker C, Schlüter R, Haeb-Umbach R. Mixture Encoder for Joint Speech Separation and Recognition. In: INTERSPEECH 2023. ISCA; 2023. doi:10.21437/interspeech.2023-1815' apa: Berger, S., Vieting, P., Boeddeker, C., Schlüter, R., & Haeb-Umbach, R. (2023). Mixture Encoder for Joint Speech Separation and Recognition. INTERSPEECH 2023. https://doi.org/10.21437/interspeech.2023-1815 bibtex: '@inproceedings{Berger_Vieting_Boeddeker_Schlüter_Haeb-Umbach_2023, title={Mixture Encoder for Joint Speech Separation and Recognition}, DOI={10.21437/interspeech.2023-1815}, booktitle={INTERSPEECH 2023}, publisher={ISCA}, author={Berger, Simon and Vieting, Peter and Boeddeker, Christoph and Schlüter, Ralf and Haeb-Umbach, Reinhold}, year={2023} }' chicago: Berger, Simon, Peter Vieting, Christoph Boeddeker, Ralf Schlüter, and Reinhold Haeb-Umbach. “Mixture Encoder for Joint Speech Separation and Recognition.” In INTERSPEECH 2023. ISCA, 2023. https://doi.org/10.21437/interspeech.2023-1815. ieee: 'S. Berger, P. Vieting, C. Boeddeker, R. Schlüter, and R. Haeb-Umbach, “Mixture Encoder for Joint Speech Separation and Recognition,” 2023, doi: 10.21437/interspeech.2023-1815.' mla: Berger, Simon, et al. “Mixture Encoder for Joint Speech Separation and Recognition.” INTERSPEECH 2023, ISCA, 2023, doi:10.21437/interspeech.2023-1815. short: 'S. Berger, P. Vieting, C. Boeddeker, R. Schlüter, R. Haeb-Umbach, in: INTERSPEECH 2023, ISCA, 2023.' 
date_created: 2023-10-23T15:06:39Z date_updated: 2023-10-23T15:10:19Z department: - _id: '54' doi: 10.21437/interspeech.2023-1815 language: - iso: eng publication: INTERSPEECH 2023 publication_status: published publisher: ISCA status: public title: Mixture Encoder for Joint Speech Separation and Recognition type: conference user_id: '40767' year: '2023' ... --- _id: '35602' abstract: - lang: eng text: "Continuous Speech Separation (CSS) has been proposed to address speech overlaps during the analysis of realistic meeting-like conversations by eliminating any overlaps before further processing.\r\nCSS separates a recording of arbitrarily many speakers into a small number of overlap-free output channels, where each output channel may contain speech of multiple speakers.\r\nThis is often done by applying a conventional separation model trained with Utterance-level Permutation Invariant Training (uPIT), which exclusively maps a speaker to an output channel, in sliding window approach called stitching.\r\nRecently, we introduced an alternative training scheme called Graph-PIT that teaches the separation network to directly produce output streams in the required format without stitching.\r\nIt can handle an arbitrary number of speakers as long as never more of them overlap at the same time than the separator has output channels.\r\nIn this contribution, we further investigate the Graph-PIT training scheme.\r\nWe show in extended experiments that models trained with Graph-PIT also work in challenging reverberant conditions.\r\nModels trained in this way are able to perform segment-less CSS, i.e., without stitching, and achieve comparable and often better separation quality than the conventional CSS with uPIT and stitching.\r\nWe simplify the training schedule for Graph-PIT with the recently proposed Source Aggregated Signal-to-Distortion Ratio (SA-SDR) loss.\r\nIt eliminates unfavorable properties of the previously used A-SDR loss and thus enables training with Graph-PIT 
from scratch.\r\nGraph-PIT training relaxes the constraints w.r.t. the allowed numbers of speakers and speaking patterns which allows using a larger variety of training data.\r\nFurthermore, we introduce novel signal-level evaluation metrics for meeting scenarios, namely the source-aggregated scale- and convolution-invariant Signal-to-Distortion Ratio (SA-SI-SDR and SA-CI-SDR), which are generalizations of the commonly used SDR-based metrics for the CSS case." article_type: original author: - first_name: Thilo full_name: von Neumann, Thilo id: '49870' last_name: von Neumann orcid: https://orcid.org/0000-0002-7717-8670 - first_name: Keisuke full_name: Kinoshita, Keisuke last_name: Kinoshita - first_name: Christoph full_name: Boeddeker, Christoph id: '40767' last_name: Boeddeker - first_name: Marc full_name: Delcroix, Marc last_name: Delcroix - first_name: Reinhold full_name: Haeb-Umbach, Reinhold id: '242' last_name: Haeb-Umbach citation: ama: 'von Neumann T, Kinoshita K, Boeddeker C, Delcroix M, Haeb-Umbach R. Segment-Less Continuous Speech Separation of Meetings: Training and Evaluation Criteria. IEEE/ACM Transactions on Audio, Speech, and Language Processing. 2023;31:576-589. doi:10.1109/taslp.2022.3228629' apa: 'von Neumann, T., Kinoshita, K., Boeddeker, C., Delcroix, M., & Haeb-Umbach, R. (2023). Segment-Less Continuous Speech Separation of Meetings: Training and Evaluation Criteria. IEEE/ACM Transactions on Audio, Speech, and Language Processing, 31, 576–589. 
https://doi.org/10.1109/taslp.2022.3228629' bibtex: '@article{von Neumann_Kinoshita_Boeddeker_Delcroix_Haeb-Umbach_2023, title={Segment-Less Continuous Speech Separation of Meetings: Training and Evaluation Criteria}, volume={31}, DOI={10.1109/taslp.2022.3228629}, journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={von Neumann, Thilo and Kinoshita, Keisuke and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}, year={2023}, pages={576–589} }' chicago: 'Neumann, Thilo von, Keisuke Kinoshita, Christoph Boeddeker, Marc Delcroix, and Reinhold Haeb-Umbach. “Segment-Less Continuous Speech Separation of Meetings: Training and Evaluation Criteria.” IEEE/ACM Transactions on Audio, Speech, and Language Processing 31 (2023): 576–89. https://doi.org/10.1109/taslp.2022.3228629.' ieee: 'T. von Neumann, K. Kinoshita, C. Boeddeker, M. Delcroix, and R. Haeb-Umbach, “Segment-Less Continuous Speech Separation of Meetings: Training and Evaluation Criteria,” IEEE/ACM Transactions on Audio, Speech, and Language Processing, vol. 31, pp. 576–589, 2023, doi: 10.1109/taslp.2022.3228629.' mla: 'von Neumann, Thilo, et al. “Segment-Less Continuous Speech Separation of Meetings: Training and Evaluation Criteria.” IEEE/ACM Transactions on Audio, Speech, and Language Processing, vol. 31, Institute of Electrical and Electronics Engineers (IEEE), 2023, pp. 576–89, doi:10.1109/taslp.2022.3228629.' short: T. von Neumann, K. Kinoshita, C. Boeddeker, M. Delcroix, R. Haeb-Umbach, IEEE/ACM Transactions on Audio, Speech, and Language Processing 31 (2023) 576–589. 
date_created: 2023-01-09T17:24:17Z date_updated: 2023-11-15T12:16:11Z ddc: - '000' department: - _id: '54' doi: 10.1109/taslp.2022.3228629 file: - access_level: open_access content_type: application/pdf creator: haebumb date_created: 2023-01-09T17:46:05Z date_updated: 2023-01-11T08:50:19Z file_id: '35607' file_name: main.pdf file_size: 7185077 relation: main_file file_date_updated: 2023-01-11T08:50:19Z has_accepted_license: '1' intvolume: ' 31' keyword: - Continuous Speech Separation - Source Separation - Graph-PIT - Dynamic Programming - Permutation Invariant Training language: - iso: eng oa: '1' page: 576-589 project: - _id: '52' name: 'PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing' publication: IEEE/ACM Transactions on Audio, Speech, and Language Processing publication_identifier: issn: - 2329-9290 - 2329-9304 publication_status: published publisher: Institute of Electrical and Electronics Engineers (IEEE) quality_controlled: '1' status: public title: 'Segment-Less Continuous Speech Separation of Meetings: Training and Evaluation Criteria' type: journal_article user_id: '49870' volume: 31 year: '2023' ... --- _id: '48281' abstract: - lang: eng text: "\tWe propose a general framework to compute the word error rate (WER) of ASR systems that process recordings containing multiple speakers at their input and that produce multiple output word sequences (MIMO).\r\n\tSuch ASR systems are typically required, e.g., for meeting transcription.\r\n\tWe provide an efficient implementation based on a dynamic programming search in a multi-dimensional Levenshtein distance tensor under the constraint that a reference utterance must be matched consistently with one hypothesis output. 
\r\n\tThis also results in an efficient implementation of the ORC WER which previously suffered from exponential complexity.\r\n\tWe give an overview of commonly used WER definitions for multi-speaker scenarios and show that they are specializations of the above MIMO WER tuned to particular application scenarios. \r\n\tWe conclude with a discussion of the pros and cons of the various WER definitions and a recommendation when to use which." author: - first_name: Thilo full_name: von Neumann, Thilo id: '49870' last_name: von Neumann orcid: https://orcid.org/0000-0002-7717-8670 - first_name: Christoph full_name: Boeddeker, Christoph id: '40767' last_name: Boeddeker - first_name: Keisuke full_name: Kinoshita, Keisuke last_name: Kinoshita - first_name: Marc full_name: Delcroix, Marc last_name: Delcroix - first_name: Reinhold full_name: Haeb-Umbach, Reinhold id: '242' last_name: Haeb-Umbach citation: ama: 'von Neumann T, Boeddeker C, Kinoshita K, Delcroix M, Haeb-Umbach R. On Word Error Rate Definitions and Their Efficient Computation for Multi-Speaker Speech Recognition Systems. In: ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE; 2023. doi:10.1109/icassp49357.2023.10094784' apa: von Neumann, T., Boeddeker, C., Kinoshita, K., Delcroix, M., & Haeb-Umbach, R. (2023). On Word Error Rate Definitions and Their Efficient Computation for Multi-Speaker Speech Recognition Systems. ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 
https://doi.org/10.1109/icassp49357.2023.10094784 bibtex: '@inproceedings{von Neumann_Boeddeker_Kinoshita_Delcroix_Haeb-Umbach_2023, title={On Word Error Rate Definitions and Their Efficient Computation for Multi-Speaker Speech Recognition Systems}, DOI={10.1109/icassp49357.2023.10094784}, booktitle={ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, publisher={IEEE}, author={von Neumann, Thilo and Boeddeker, Christoph and Kinoshita, Keisuke and Delcroix, Marc and Haeb-Umbach, Reinhold}, year={2023} }' chicago: Neumann, Thilo von, Christoph Boeddeker, Keisuke Kinoshita, Marc Delcroix, and Reinhold Haeb-Umbach. “On Word Error Rate Definitions and Their Efficient Computation for Multi-Speaker Speech Recognition Systems.” In ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 2023. https://doi.org/10.1109/icassp49357.2023.10094784. ieee: 'T. von Neumann, C. Boeddeker, K. Kinoshita, M. Delcroix, and R. Haeb-Umbach, “On Word Error Rate Definitions and Their Efficient Computation for Multi-Speaker Speech Recognition Systems,” 2023, doi: 10.1109/icassp49357.2023.10094784.' mla: von Neumann, Thilo, et al. “On Word Error Rate Definitions and Their Efficient Computation for Multi-Speaker Speech Recognition Systems.” ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), IEEE, 2023, doi:10.1109/icassp49357.2023.10094784. short: 'T. von Neumann, C. Boeddeker, K. Kinoshita, M. Delcroix, R. Haeb-Umbach, in: ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), IEEE, 2023.' 
date_created: 2023-10-19T07:38:31Z date_updated: 2023-11-15T12:14:15Z ddc: - '000' department: - _id: '54' doi: 10.1109/icassp49357.2023.10094784 file: - access_level: open_access content_type: application/pdf creator: tvn date_created: 2023-10-19T07:39:57Z date_updated: 2023-10-19T07:41:56Z file_id: '48282' file_name: ICASSP_2023_Meeting_Evaluation.pdf file_size: 204994 relation: main_file file_date_updated: 2023-10-19T07:41:56Z has_accepted_license: '1' keyword: - Word Error Rate - Meeting Recognition - Levenshtein Distance language: - iso: eng main_file_link: - url: https://ieeexplore.ieee.org/document/10094784 oa: '1' publication: ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) publication_status: published publisher: IEEE quality_controlled: '1' related_material: link: - relation: software url: https://github.com/fgnt/meeteval status: public title: On Word Error Rate Definitions and Their Efficient Computation for Multi-Speaker Speech Recognition Systems type: conference user_id: '49870' year: '2023' ... 
--- _id: '48275' abstract: - lang: eng text: "MeetEval is an open-source toolkit to evaluate all kinds of meeting transcription systems.\r\nIt provides a unified interface for the computation of commonly used Word Error Rates (WERs), specifically cpWER, ORC WER and MIMO WER along other WER definitions.\r\nWe extend the cpWER computation by a temporal constraint to ensure that only words are identified as correct when the temporal alignment is plausible.\r\nThis leads to a better quality of the matching of the hypothesis string to the reference string that more closely resembles the actual transcription quality, and a system is penalized if it provides poor time annotations.\r\nSince word-level timing information is often not available, we present a way to approximate exact word-level timings from segment-level timings (e.g., a sentence) and show that the approximation leads to a similar WER as a matching with exact word-level annotations.\r\nAt the same time, the time constraint leads to a speedup of the matching algorithm, which outweighs the additional overhead caused by processing the time stamps." author: - first_name: Thilo full_name: von Neumann, Thilo id: '49870' last_name: von Neumann orcid: https://orcid.org/0000-0002-7717-8670 - first_name: Christoph full_name: Boeddeker, Christoph id: '40767' last_name: Boeddeker - first_name: Marc full_name: Delcroix, Marc last_name: Delcroix - first_name: Reinhold full_name: Haeb-Umbach, Reinhold id: '242' last_name: Haeb-Umbach citation: ama: 'von Neumann T, Boeddeker C, Delcroix M, Haeb-Umbach R. MeetEval: A Toolkit for Computation of Word Error Rates for Meeting Transcription Systems. In: Proc. CHiME 2023 Workshop on Speech Processing in Everyday Environments. ; 2023.' apa: 'von Neumann, T., Boeddeker, C., Delcroix, M., & Haeb-Umbach, R. (2023). MeetEval: A Toolkit for Computation of Word Error Rates for Meeting Transcription Systems. Proc. CHiME 2023 Workshop on Speech Processing in Everyday Environments. 
CHiME 2023 Workshop on Speech Processing in Everyday Environments, Dublin.' bibtex: '@inproceedings{von Neumann_Boeddeker_Delcroix_Haeb-Umbach_2023, title={MeetEval: A Toolkit for Computation of Word Error Rates for Meeting Transcription Systems}, booktitle={Proc. CHiME 2023 Workshop on Speech Processing in Everyday Environments}, author={von Neumann, Thilo and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}, year={2023} }' chicago: 'Neumann, Thilo von, Christoph Boeddeker, Marc Delcroix, and Reinhold Haeb-Umbach. “MeetEval: A Toolkit for Computation of Word Error Rates for Meeting Transcription Systems.” In Proc. CHiME 2023 Workshop on Speech Processing in Everyday Environments, 2023.' ieee: 'T. von Neumann, C. Boeddeker, M. Delcroix, and R. Haeb-Umbach, “MeetEval: A Toolkit for Computation of Word Error Rates for Meeting Transcription Systems,” presented at the CHiME 2023 Workshop on Speech Processing in Everyday Environments, Dublin, 2023.' mla: 'von Neumann, Thilo, et al. “MeetEval: A Toolkit for Computation of Word Error Rates for Meeting Transcription Systems.” Proc. CHiME 2023 Workshop on Speech Processing in Everyday Environments, 2023.' short: 'T. von Neumann, C. Boeddeker, M. Delcroix, R. Haeb-Umbach, in: Proc. CHiME 2023 Workshop on Speech Processing in Everyday Environments, 2023.' 
conference: location: Dublin name: CHiME 2023 Workshop on Speech Processing in Everyday Environments date_created: 2023-10-19T07:24:51Z date_updated: 2023-11-15T12:14:02Z ddc: - '000' department: - _id: '54' file: - access_level: open_access content_type: application/pdf creator: tvn date_created: 2023-10-19T07:19:59Z date_updated: 2023-10-19T07:19:59Z file_id: '48276' file_name: Chime_7__MeetEval.pdf file_size: 263744 relation: main_file file_date_updated: 2023-10-19T07:19:59Z has_accepted_license: '1' keyword: - Speech Recognition - Word Error Rate - Meeting Transcription language: - iso: eng main_file_link: - open_access: '1' url: https://arxiv.org/abs/2307.11394 oa: '1' project: - _id: '52' name: 'PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing' publication: Proc. CHiME 2023 Workshop on Speech Processing in Everyday Environments quality_controlled: '1' related_material: link: - relation: software url: https://github.com/fgnt/meeteval status: public title: 'MeetEval: A Toolkit for Computation of Word Error Rates for Meeting Transcription Systems' type: conference user_id: '49870' year: '2023' ... --- _id: '33669' abstract: - lang: eng text: Far-field multi-speaker automatic speech recognition (ASR) has drawn increasing attention in recent years. Most existing methods feature a signal processing frontend and an ASR backend. In realistic scenarios, these modules are usually trained separately or progressively, which suffers from either inter-module mismatch or a complicated training process. In this paper, we propose an end-to-end multi-channel model that jointly optimizes the speech enhancement (including speech dereverberation, denoising, and separation) frontend and the ASR backend as a single system. To the best of our knowledge, this is the first work that proposes to optimize dereverberation, beamforming, and multi-speaker ASR in a fully end-to-end manner. 
The frontend module consists of a weighted prediction error (WPE) based submodule for dereverberation and a neural beamformer for denoising and speech separation. For the backend, we adopt a widely used end-to-end (E2E) ASR architecture. It is worth noting that the entire model is differentiable and can be optimized in a fully end-to-end manner using only the ASR criterion, without the need of parallel signal-level labels. We evaluate the proposed model on several multi-speaker benchmark datasets, and experimental results show that the fully E2E ASR model can achieve competitive performance on both noisy and reverberant conditions, with over 30% relative word error rate (WER) reduction over the single-channel baseline systems. author: - first_name: Wangyou full_name: Zhang, Wangyou last_name: Zhang - first_name: Xuankai full_name: Chang, Xuankai last_name: Chang - first_name: Christoph full_name: Boeddeker, Christoph id: '40767' last_name: Boeddeker - first_name: Tomohiro full_name: Nakatani, Tomohiro last_name: Nakatani - first_name: Shinji full_name: Watanabe, Shinji last_name: Watanabe - first_name: Yanmin full_name: Qian, Yanmin last_name: Qian citation: ama: Zhang W, Chang X, Boeddeker C, Nakatani T, Watanabe S, Qian Y. End-to-End Dereverberation, Beamforming, and Speech Recognition in A Cocktail Party. IEEE/ACM Transactions on Audio, Speech, and Language Processing. Published online 2022. doi:10.1109/TASLP.2022.3209942 apa: Zhang, W., Chang, X., Boeddeker, C., Nakatani, T., Watanabe, S., & Qian, Y. (2022). End-to-End Dereverberation, Beamforming, and Speech Recognition in A Cocktail Party. IEEE/ACM Transactions on Audio, Speech, and Language Processing. 
https://doi.org/10.1109/TASLP.2022.3209942 bibtex: '@article{Zhang_Chang_Boeddeker_Nakatani_Watanabe_Qian_2022, title={End-to-End Dereverberation, Beamforming, and Speech Recognition in A Cocktail Party}, DOI={10.1109/TASLP.2022.3209942}, journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing}, author={Zhang, Wangyou and Chang, Xuankai and Boeddeker, Christoph and Nakatani, Tomohiro and Watanabe, Shinji and Qian, Yanmin}, year={2022} }' chicago: Zhang, Wangyou, Xuankai Chang, Christoph Boeddeker, Tomohiro Nakatani, Shinji Watanabe, and Yanmin Qian. “End-to-End Dereverberation, Beamforming, and Speech Recognition in A Cocktail Party.” IEEE/ACM Transactions on Audio, Speech, and Language Processing, 2022. https://doi.org/10.1109/TASLP.2022.3209942. ieee: 'W. Zhang, X. Chang, C. Boeddeker, T. Nakatani, S. Watanabe, and Y. Qian, “End-to-End Dereverberation, Beamforming, and Speech Recognition in A Cocktail Party,” IEEE/ACM Transactions on Audio, Speech, and Language Processing, 2022, doi: 10.1109/TASLP.2022.3209942.' mla: Zhang, Wangyou, et al. “End-to-End Dereverberation, Beamforming, and Speech Recognition in A Cocktail Party.” IEEE/ACM Transactions on Audio, Speech, and Language Processing, 2022, doi:10.1109/TASLP.2022.3209942. short: W. Zhang, X. Chang, C. Boeddeker, T. Nakatani, S. Watanabe, Y. Qian, IEEE/ACM Transactions on Audio, Speech, and Language Processing (2022). 
date_created: 2022-10-11T07:27:51Z date_updated: 2022-12-05T12:35:31Z ddc: - '000' department: - _id: '54' doi: 10.1109/TASLP.2022.3209942 file: - access_level: open_access content_type: application/pdf creator: huesera date_created: 2022-10-11T07:23:13Z date_updated: 2022-10-11T07:23:13Z file_id: '33674' file_name: End-to-End_Dereverberation_Beamforming_and_Speech_Recognition_in_A_Cocktail_Party.pdf file_size: 6167931 relation: main_file file_date_updated: 2022-10-11T07:23:13Z has_accepted_license: '1' language: - iso: eng oa: '1' publication: IEEE/ACM Transactions on Audio, Speech, and Language Processing publication_identifier: issn: - '2329-9290' - '2329-9304' publication_status: published related_material: link: - relation: confirmation url: https://ieeexplore.ieee.org/abstract/document/9904314 status: public title: End-to-End Dereverberation, Beamforming, and Speech Recognition in A Cocktail Party type: journal_article user_id: '40767' year: '2022' ... --- _id: '33954' author: - first_name: Christoph full_name: Boeddeker, Christoph id: '40767' last_name: Boeddeker - first_name: Tobias full_name: Cord-Landwehr, Tobias id: '44393' last_name: Cord-Landwehr - first_name: Thilo full_name: von Neumann, Thilo id: '49870' last_name: von Neumann orcid: https://orcid.org/0000-0002-7717-8670 - first_name: Reinhold full_name: Haeb-Umbach, Reinhold id: '242' last_name: Haeb-Umbach citation: ama: 'Boeddeker C, Cord-Landwehr T, von Neumann T, Haeb-Umbach R. An Initialization Scheme for Meeting Separation with Spatial Mixture Models. In: Interspeech 2022. ISCA; 2022. doi:10.21437/interspeech.2022-10929' apa: Boeddeker, C., Cord-Landwehr, T., von Neumann, T., & Haeb-Umbach, R. (2022). An Initialization Scheme for Meeting Separation with Spatial Mixture Models. Interspeech 2022. 
https://doi.org/10.21437/interspeech.2022-10929 bibtex: '@inproceedings{Boeddeker_Cord-Landwehr_von Neumann_Haeb-Umbach_2022, title={An Initialization Scheme for Meeting Separation with Spatial Mixture Models}, DOI={10.21437/interspeech.2022-10929}, booktitle={Interspeech 2022}, publisher={ISCA}, author={Boeddeker, Christoph and Cord-Landwehr, Tobias and von Neumann, Thilo and Haeb-Umbach, Reinhold}, year={2022} }' chicago: Boeddeker, Christoph, Tobias Cord-Landwehr, Thilo von Neumann, and Reinhold Haeb-Umbach. “An Initialization Scheme for Meeting Separation with Spatial Mixture Models.” In Interspeech 2022. ISCA, 2022. https://doi.org/10.21437/interspeech.2022-10929. ieee: 'C. Boeddeker, T. Cord-Landwehr, T. von Neumann, and R. Haeb-Umbach, “An Initialization Scheme for Meeting Separation with Spatial Mixture Models,” 2022, doi: 10.21437/interspeech.2022-10929.' mla: Boeddeker, Christoph, et al. “An Initialization Scheme for Meeting Separation with Spatial Mixture Models.” Interspeech 2022, ISCA, 2022, doi:10.21437/interspeech.2022-10929. short: 'C. Boeddeker, T. Cord-Landwehr, T. von Neumann, R. Haeb-Umbach, in: Interspeech 2022, ISCA, 2022.' date_created: 2022-10-28T10:53:56Z date_updated: 2022-10-28T10:57:22Z department: - _id: '54' doi: 10.21437/interspeech.2022-10929 language: - iso: eng main_file_link: - open_access: '1' url: https://www.isca-speech.org/archive/pdfs/interspeech_2022/boeddeker22_interspeech.pdf oa: '1' project: - _id: '52' name: 'PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing' publication: Interspeech 2022 publication_status: published publisher: ISCA status: public title: An Initialization Scheme for Meeting Separation with Spatial Mixture Models type: conference user_id: '40767' year: '2022' ... 
--- _id: '33958' abstract: - lang: eng text: Recent speaker diarization studies showed that integration of end-to-end neural diarization (EEND) and clustering-based diarization is a promising approach for achieving state-of-the-art performance on various tasks. Such an approach first divides an observed signal into fixed-length segments, then performs segment-level local diarization based on an EEND module, and merges the segment-level results via clustering to form a final global diarization result. The segmentation is done to limit the number of speakers in each segment since the current EEND cannot handle a large number of speakers. In this paper, we argue that such an approach involving the segmentation has several issues; for example, it inevitably faces a dilemma that larger segment sizes increase both the context available for enhancing the performance and the number of speakers for the local EEND module to handle. To resolve such a problem, this paper proposes a novel framework that performs diarization without segmentation. However, it can still handle challenging data containing many speakers and a significant amount of overlapping speech. The proposed method can take an entire meeting for inference and perform utterance-by-utterance diarization that clusters utterance activities in terms of speakers. To this end, we leverage a neural network training scheme called Graph-PIT proposed recently for neural source separation. Experiments with simulated active-meeting-like data and CALLHOME data show the superiority of the proposed approach over the conventional methods. 
author: - first_name: Keisuke full_name: Kinoshita, Keisuke last_name: Kinoshita - first_name: Thilo full_name: von Neumann, Thilo id: '49870' last_name: von Neumann orcid: https://orcid.org/0000-0002-7717-8670 - first_name: Marc full_name: Delcroix, Marc last_name: Delcroix - first_name: Christoph full_name: Boeddeker, Christoph id: '40767' last_name: Boeddeker - first_name: Reinhold full_name: Haeb-Umbach, Reinhold id: '242' last_name: Haeb-Umbach citation: ama: 'Kinoshita K, von Neumann T, Delcroix M, Boeddeker C, Haeb-Umbach R. Utterance-by-utterance overlap-aware neural diarization with Graph-PIT. In: Proc. Interspeech 2022. ISCA; 2022:1486-1490. doi:10.21437/Interspeech.2022-11408' apa: Kinoshita, K., von Neumann, T., Delcroix, M., Boeddeker, C., & Haeb-Umbach, R. (2022). Utterance-by-utterance overlap-aware neural diarization with Graph-PIT. Proc. Interspeech 2022, 1486–1490. https://doi.org/10.21437/Interspeech.2022-11408 bibtex: '@inproceedings{Kinoshita_von Neumann_Delcroix_Boeddeker_Haeb-Umbach_2022, title={Utterance-by-utterance overlap-aware neural diarization with Graph-PIT}, DOI={10.21437/Interspeech.2022-11408}, booktitle={Proc. Interspeech 2022}, publisher={ISCA}, author={Kinoshita, Keisuke and von Neumann, Thilo and Delcroix, Marc and Boeddeker, Christoph and Haeb-Umbach, Reinhold}, year={2022}, pages={1486–1490} }' chicago: Kinoshita, Keisuke, Thilo von Neumann, Marc Delcroix, Christoph Boeddeker, and Reinhold Haeb-Umbach. “Utterance-by-Utterance Overlap-Aware Neural Diarization with Graph-PIT.” In Proc. Interspeech 2022, 1486–90. ISCA, 2022. https://doi.org/10.21437/Interspeech.2022-11408. ieee: 'K. Kinoshita, T. von Neumann, M. Delcroix, C. Boeddeker, and R. Haeb-Umbach, “Utterance-by-utterance overlap-aware neural diarization with Graph-PIT,” in Proc. Interspeech 2022, 2022, pp. 1486–1490, doi: 10.21437/Interspeech.2022-11408.' mla: Kinoshita, Keisuke, et al. “Utterance-by-Utterance Overlap-Aware Neural Diarization with Graph-PIT.” Proc. 
Interspeech 2022, ISCA, 2022, pp. 1486–90, doi:10.21437/Interspeech.2022-11408. short: 'K. Kinoshita, T. von Neumann, M. Delcroix, C. Boeddeker, R. Haeb-Umbach, in: Proc. Interspeech 2022, ISCA, 2022, pp. 1486–1490.' conference: name: Interspeech 2022 date_created: 2022-10-28T12:07:57Z date_updated: 2023-11-15T12:17:04Z department: - _id: '54' doi: 10.21437/Interspeech.2022-11408 language: - iso: eng page: 1486-1490 publication: Proc. Interspeech 2022 publication_status: published publisher: ISCA quality_controlled: '1' status: public title: Utterance-by-utterance overlap-aware neural diarization with Graph-PIT type: conference user_id: '49870' year: '2022' ... --- _id: '33819' author: - first_name: Thilo full_name: von Neumann, Thilo id: '49870' last_name: von Neumann orcid: https://orcid.org/0000-0002-7717-8670 - first_name: Keisuke full_name: Kinoshita, Keisuke last_name: Kinoshita - first_name: Christoph full_name: Boeddeker, Christoph id: '40767' last_name: Boeddeker - first_name: Marc full_name: Delcroix, Marc last_name: Delcroix - first_name: Reinhold full_name: Haeb-Umbach, Reinhold id: '242' last_name: Haeb-Umbach citation: ama: 'von Neumann T, Kinoshita K, Boeddeker C, Delcroix M, Haeb-Umbach R. SA-SDR: A Novel Loss Function for Separation of Meeting Style Data. In: ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE; 2022. doi:10.1109/icassp43922.2022.9746757' apa: 'von Neumann, T., Kinoshita, K., Boeddeker, C., Delcroix, M., & Haeb-Umbach, R. (2022). SA-SDR: A Novel Loss Function for Separation of Meeting Style Data. ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 
https://doi.org/10.1109/icassp43922.2022.9746757' bibtex: '@inproceedings{von Neumann_Kinoshita_Boeddeker_Delcroix_Haeb-Umbach_2022, title={SA-SDR: A Novel Loss Function for Separation of Meeting Style Data}, DOI={10.1109/icassp43922.2022.9746757}, booktitle={ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, publisher={IEEE}, author={von Neumann, Thilo and Kinoshita, Keisuke and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}, year={2022} }' chicago: 'Neumann, Thilo von, Keisuke Kinoshita, Christoph Boeddeker, Marc Delcroix, and Reinhold Haeb-Umbach. “SA-SDR: A Novel Loss Function for Separation of Meeting Style Data.” In ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 2022. https://doi.org/10.1109/icassp43922.2022.9746757.' ieee: 'T. von Neumann, K. Kinoshita, C. Boeddeker, M. Delcroix, and R. Haeb-Umbach, “SA-SDR: A Novel Loss Function for Separation of Meeting Style Data,” 2022, doi: 10.1109/icassp43922.2022.9746757.' mla: 'von Neumann, Thilo, et al. “SA-SDR: A Novel Loss Function for Separation of Meeting Style Data.” ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), IEEE, 2022, doi:10.1109/icassp43922.2022.9746757.' short: 'T. von Neumann, K. Kinoshita, C. Boeddeker, M. Delcroix, R. Haeb-Umbach, in: ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), IEEE, 2022.' 
date_created: 2022-10-20T05:29:12Z date_updated: 2023-11-15T12:16:47Z ddc: - '000' department: - _id: '54' doi: 10.1109/icassp43922.2022.9746757 file: - access_level: open_access content_type: application/pdf creator: tvn date_created: 2022-10-20T05:33:10Z date_updated: 2022-10-20T05:33:10Z file_id: '33820' file_name: main.pdf file_size: 228069 relation: main_file - access_level: open_access content_type: application/pdf creator: tvn date_created: 2022-10-20T05:35:32Z date_updated: 2022-10-20T05:35:32Z file_id: '33821' file_name: poster.pdf file_size: 229166 relation: poster file_date_updated: 2022-10-20T05:35:32Z has_accepted_license: '1' language: - iso: eng oa: '1' project: - _id: '52' name: 'PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing' publication: ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) publication_status: published publisher: IEEE quality_controlled: '1' related_material: link: - relation: supplementary_material url: https://github.com/fgnt/graph_pit status: public title: 'SA-SDR: A Novel Loss Function for Separation of Meeting Style Data' type: conference user_id: '49870' year: '2022' ... --- _id: '33847' abstract: - lang: eng text: "The scope of speech enhancement has changed from a monolithic view of single,\r\nindependent tasks, to a joint processing of complex conversational speech\r\nrecordings. Training and evaluation of these single tasks requires synthetic\r\ndata with access to intermediate signals that is as close as possible to the\r\nevaluation scenario. As such data often is not available, many works instead\r\nuse specialized databases for the training of each system component, e.g.\r\nWSJ0-mix for source separation. 
We present a Multi-purpose Multi-Speaker\r\nMixture Signal Generator (MMS-MSG) for generating a variety of speech mixture\r\nsignals based on any speech corpus, ranging from classical anechoic mixtures\r\n(e.g., WSJ0-mix) over reverberant mixtures (e.g., SMS-WSJ) to meeting-style\r\ndata. Its highly modular and flexible structure allows for the simulation of\r\ndiverse environments and dynamic mixing, while simultaneously enabling an easy\r\nextension and modification to generate new scenarios and mixture types. These\r\nmeetings can be used for prototyping, evaluation, or training purposes. We\r\nprovide example evaluation data and baseline results for meetings based on the\r\nWSJ corpus. Further, we demonstrate the usefulness for realistic scenarios by\r\nusing MMS-MSG to provide training data for the LibriCSS database." author: - first_name: Tobias full_name: Cord-Landwehr, Tobias id: '44393' last_name: Cord-Landwehr - first_name: Thilo full_name: von Neumann, Thilo id: '49870' last_name: von Neumann orcid: https://orcid.org/0000-0002-7717-8670 - first_name: Christoph full_name: Boeddeker, Christoph id: '40767' last_name: Boeddeker - first_name: Reinhold full_name: Haeb-Umbach, Reinhold id: '242' last_name: Haeb-Umbach citation: ama: 'Cord-Landwehr T, von Neumann T, Boeddeker C, Haeb-Umbach R. MMS-MSG: A Multi-purpose Multi-Speaker Mixture Signal Generator. In: 2022 International Workshop on Acoustic Signal Enhancement (IWAENC). ; 2022.' apa: 'Cord-Landwehr, T., von Neumann, T., Boeddeker, C., & Haeb-Umbach, R. (2022). MMS-MSG: A Multi-purpose Multi-Speaker Mixture Signal Generator. 2022 International Workshop on Acoustic Signal Enhancement (IWAENC). 2022 International Workshop on Acoustic Signal Enhancement (IWAENC), Bamberg.' 
bibtex: '@inproceedings{Cord-Landwehr_von Neumann_Boeddeker_Haeb-Umbach_2022, title={MMS-MSG: A Multi-purpose Multi-Speaker Mixture Signal Generator}, booktitle={2022 International Workshop on Acoustic Signal Enhancement (IWAENC)}, author={Cord-Landwehr, Tobias and von Neumann, Thilo and Boeddeker, Christoph and Haeb-Umbach, Reinhold}, year={2022} }' chicago: 'Cord-Landwehr, Tobias, Thilo von Neumann, Christoph Boeddeker, and Reinhold Haeb-Umbach. “MMS-MSG: A Multi-Purpose Multi-Speaker Mixture Signal Generator.” In 2022 International Workshop on Acoustic Signal Enhancement (IWAENC), 2022.' ieee: 'T. Cord-Landwehr, T. von Neumann, C. Boeddeker, and R. Haeb-Umbach, “MMS-MSG: A Multi-purpose Multi-Speaker Mixture Signal Generator,” presented at the 2022 International Workshop on Acoustic Signal Enhancement (IWAENC), Bamberg, 2022.' mla: 'Cord-Landwehr, Tobias, et al. “MMS-MSG: A Multi-Purpose Multi-Speaker Mixture Signal Generator.” 2022 International Workshop on Acoustic Signal Enhancement (IWAENC), 2022.' short: 'T. Cord-Landwehr, T. von Neumann, C. Boeddeker, R. Haeb-Umbach, in: 2022 International Workshop on Acoustic Signal Enhancement (IWAENC), 2022.' 
conference: location: Bamberg name: 2022 International Workshop on Acoustic Signal Enhancement (IWAENC) date_created: 2022-10-20T14:02:14Z date_updated: 2023-11-15T14:55:14Z ddc: - '000' department: - _id: '54' external_id: arxiv: - '2209.11494' file: - access_level: open_access content_type: application/pdf creator: cord date_created: 2023-11-15T14:54:56Z date_updated: 2023-11-15T14:54:56Z file_id: '48931' file_name: mms_msg_camera_ready.pdf file_size: 177975 relation: main_file file_date_updated: 2023-11-15T14:54:56Z has_accepted_license: '1' language: - iso: eng oa: '1' project: - _id: '52' name: 'PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing' publication: 2022 International Workshop on Acoustic Signal Enhancement (IWAENC) quality_controlled: '1' status: public title: 'MMS-MSG: A Multi-purpose Multi-Speaker Mixture Signal Generator' type: conference user_id: '44393' year: '2022' ... --- _id: '33848' abstract: - lang: eng text: "Impressive progress in neural network-based single-channel speech source\r\nseparation has been made in recent years. But those improvements have been\r\nmostly reported on anechoic data, a situation that is hardly met in practice.\r\nTaking the SepFormer as a starting point, which achieves state-of-the-art\r\nperformance on anechoic mixtures, we gradually modify it to optimize its\r\nperformance on reverberant mixtures. Although this leads to a word error rate\r\nimprovement by 7 percentage points compared to the standard SepFormer\r\nimplementation, the system ends up with only marginally better performance than\r\na PIT-BLSTM separation system, that is optimized with rather straightforward\r\nmeans. This is surprising and at the same time sobering, challenging the\r\npractical usefulness of many improvements reported in recent years for monaural\r\nsource separation on nonreverberant data." 
author: - first_name: Tobias full_name: Cord-Landwehr, Tobias id: '44393' last_name: Cord-Landwehr - first_name: Christoph full_name: Boeddeker, Christoph id: '40767' last_name: Boeddeker - first_name: Thilo full_name: von Neumann, Thilo id: '49870' last_name: von Neumann orcid: https://orcid.org/0000-0002-7717-8670 - first_name: Catalin full_name: Zorila, Catalin last_name: Zorila - first_name: Rama full_name: Doddipatla, Rama last_name: Doddipatla - first_name: Reinhold full_name: Haeb-Umbach, Reinhold id: '242' last_name: Haeb-Umbach citation: ama: 'Cord-Landwehr T, Boeddeker C, von Neumann T, Zorila C, Doddipatla R, Haeb-Umbach R. Monaural source separation: From anechoic to reverberant environments. In: 2022 International Workshop on Acoustic Signal Enhancement (IWAENC). IEEE; 2022.' apa: 'Cord-Landwehr, T., Boeddeker, C., von Neumann, T., Zorila, C., Doddipatla, R., & Haeb-Umbach, R. (2022). Monaural source separation: From anechoic to reverberant environments. 2022 International Workshop on Acoustic Signal Enhancement (IWAENC). 2022 International Workshop on Acoustic Signal Enhancement (IWAENC).' bibtex: '@inproceedings{Cord-Landwehr_Boeddeker_von Neumann_Zorila_Doddipatla_Haeb-Umbach_2022, place={Bamberg}, title={Monaural source separation: From anechoic to reverberant environments}, booktitle={2022 International Workshop on Acoustic Signal Enhancement (IWAENC)}, publisher={IEEE}, author={Cord-Landwehr, Tobias and Boeddeker, Christoph and von Neumann, Thilo and Zorila, Catalin and Doddipatla, Rama and Haeb-Umbach, Reinhold}, year={2022} }' chicago: 'Cord-Landwehr, Tobias, Christoph Boeddeker, Thilo von Neumann, Catalin Zorila, Rama Doddipatla, and Reinhold Haeb-Umbach. “Monaural Source Separation: From Anechoic to Reverberant Environments.” In 2022 International Workshop on Acoustic Signal Enhancement (IWAENC). Bamberg: IEEE, 2022.' ieee: 'T. Cord-Landwehr, C. Boeddeker, T. von Neumann, C. Zorila, R. Doddipatla, and R. 
Haeb-Umbach, “Monaural source separation: From anechoic to reverberant environments,” presented at the 2022 International Workshop on Acoustic Signal Enhancement (IWAENC), 2022.' mla: 'Cord-Landwehr, Tobias, et al. “Monaural Source Separation: From Anechoic to Reverberant Environments.” 2022 International Workshop on Acoustic Signal Enhancement (IWAENC), IEEE, 2022.' short: 'T. Cord-Landwehr, C. Boeddeker, T. von Neumann, C. Zorila, R. Doddipatla, R. Haeb-Umbach, in: 2022 International Workshop on Acoustic Signal Enhancement (IWAENC), IEEE, Bamberg, 2022.' conference: name: 2022 International Workshop on Acoustic Signal Enhancement (IWAENC) date_created: 2022-10-20T14:07:28Z date_updated: 2023-11-15T14:53:06Z ddc: - '000' department: - _id: '54' external_id: arxiv: - '2111.07578' file: - access_level: open_access content_type: application/pdf creator: cord date_created: 2023-11-15T14:52:16Z date_updated: 2023-11-15T14:52:16Z file_id: '48930' file_name: monaural_source_separation.pdf file_size: 212890 relation: main_file file_date_updated: 2023-11-15T14:52:16Z has_accepted_license: '1' language: - iso: eng oa: '1' place: Bamberg project: - _id: '52' name: 'PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing' publication: 2022 International Workshop on Acoustic Signal Enhancement (IWAENC) publisher: IEEE status: public title: 'Monaural source separation: From anechoic to reverberant environments' type: conference user_id: '44393' year: '2022' ... 
--- _id: '33816' author: - first_name: Tobias full_name: Gburrek, Tobias id: '44006' last_name: Gburrek - first_name: Christoph full_name: Boeddeker, Christoph id: '40767' last_name: Boeddeker - first_name: Thilo full_name: von Neumann, Thilo id: '49870' last_name: von Neumann orcid: https://orcid.org/0000-0002-7717-8670 - first_name: Tobias full_name: Cord-Landwehr, Tobias id: '44393' last_name: Cord-Landwehr - first_name: Joerg full_name: Schmalenstroeer, Joerg id: '460' last_name: Schmalenstroeer - first_name: Reinhold full_name: Haeb-Umbach, Reinhold id: '242' last_name: Haeb-Umbach citation: ama: Gburrek T, Boeddeker C, von Neumann T, Cord-Landwehr T, Schmalenstroeer J, Haeb-Umbach R. A Meeting Transcription System for an Ad-Hoc Acoustic Sensor Network. arXiv; 2022. doi:10.48550/ARXIV.2205.00944 apa: Gburrek, T., Boeddeker, C., von Neumann, T., Cord-Landwehr, T., Schmalenstroeer, J., & Haeb-Umbach, R. (2022). A Meeting Transcription System for an Ad-Hoc Acoustic Sensor Network. arXiv. https://doi.org/10.48550/ARXIV.2205.00944 bibtex: '@book{Gburrek_Boeddeker_von Neumann_Cord-Landwehr_Schmalenstroeer_Haeb-Umbach_2022, title={A Meeting Transcription System for an Ad-Hoc Acoustic Sensor Network}, DOI={10.48550/ARXIV.2205.00944}, publisher={arXiv}, author={Gburrek, Tobias and Boeddeker, Christoph and von Neumann, Thilo and Cord-Landwehr, Tobias and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold}, year={2022} }' chicago: Gburrek, Tobias, Christoph Boeddeker, Thilo von Neumann, Tobias Cord-Landwehr, Joerg Schmalenstroeer, and Reinhold Haeb-Umbach. A Meeting Transcription System for an Ad-Hoc Acoustic Sensor Network. arXiv, 2022. https://doi.org/10.48550/ARXIV.2205.00944. ieee: T. Gburrek, C. Boeddeker, T. von Neumann, T. Cord-Landwehr, J. Schmalenstroeer, and R. Haeb-Umbach, A Meeting Transcription System for an Ad-Hoc Acoustic Sensor Network. arXiv, 2022. mla: Gburrek, Tobias, et al. A Meeting Transcription System for an Ad-Hoc Acoustic Sensor Network. 
arXiv, 2022, doi:10.48550/ARXIV.2205.00944. short: T. Gburrek, C. Boeddeker, T. von Neumann, T. Cord-Landwehr, J. Schmalenstroeer, R. Haeb-Umbach, A Meeting Transcription System for an Ad-Hoc Acoustic Sensor Network, arXiv, 2022. date_created: 2022-10-18T11:10:58Z date_updated: 2023-11-17T06:42:16Z ddc: - '004' department: - _id: '54' doi: 10.48550/ARXIV.2205.00944 file: - access_level: open_access content_type: application/pdf creator: tgburrek date_created: 2023-11-17T06:42:04Z date_updated: 2023-11-17T06:42:04Z file_id: '48992' file_name: meeting_transcription_22.pdf file_size: 199006 relation: main_file file_date_updated: 2023-11-17T06:42:04Z has_accepted_license: '1' language: - iso: eng oa: '1' project: - _id: '52' name: 'PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing' publisher: arXiv status: public title: A Meeting Transcription System for an Ad-Hoc Acoustic Sensor Network type: misc user_id: '44006' year: '2022' ... --- _id: '28256' author: - first_name: Wangyou full_name: Zhang, Wangyou last_name: Zhang - first_name: Christoph full_name: Boeddeker, Christoph id: '40767' last_name: Boeddeker - first_name: Shinji full_name: Watanabe, Shinji last_name: Watanabe - first_name: Tomohiro full_name: Nakatani, Tomohiro last_name: Nakatani - first_name: Marc full_name: Delcroix, Marc last_name: Delcroix - first_name: Keisuke full_name: Kinoshita, Keisuke last_name: Kinoshita - first_name: Tsubasa full_name: Ochiai, Tsubasa last_name: Ochiai - first_name: Naoyuki full_name: Kamo, Naoyuki last_name: Kamo - first_name: Reinhold full_name: Haeb-Umbach, Reinhold id: '242' last_name: Haeb-Umbach - first_name: Yanmin full_name: Qian, Yanmin last_name: Qian citation: ama: 'Zhang W, Boeddeker C, Watanabe S, et al. End-to-End Dereverberation, Beamforming, and Speech Recognition with Improved Numerical Stability and Advanced Frontend. In: ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 
; 2021. doi:10.1109/icassp39728.2021.9414464' apa: Zhang, W., Boeddeker, C., Watanabe, S., Nakatani, T., Delcroix, M., Kinoshita, K., Ochiai, T., Kamo, N., Haeb-Umbach, R., & Qian, Y. (2021). End-to-End Dereverberation, Beamforming, and Speech Recognition with Improved Numerical Stability and Advanced Frontend. ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). https://doi.org/10.1109/icassp39728.2021.9414464 bibtex: '@inproceedings{Zhang_Boeddeker_Watanabe_Nakatani_Delcroix_Kinoshita_Ochiai_Kamo_Haeb-Umbach_Qian_2021, title={End-to-End Dereverberation, Beamforming, and Speech Recognition with Improved Numerical Stability and Advanced Frontend}, DOI={10.1109/icassp39728.2021.9414464}, booktitle={ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, author={Zhang, Wangyou and Boeddeker, Christoph and Watanabe, Shinji and Nakatani, Tomohiro and Delcroix, Marc and Kinoshita, Keisuke and Ochiai, Tsubasa and Kamo, Naoyuki and Haeb-Umbach, Reinhold and Qian, Yanmin}, year={2021} }' chicago: Zhang, Wangyou, Christoph Boeddeker, Shinji Watanabe, Tomohiro Nakatani, Marc Delcroix, Keisuke Kinoshita, Tsubasa Ochiai, Naoyuki Kamo, Reinhold Haeb-Umbach, and Yanmin Qian. “End-to-End Dereverberation, Beamforming, and Speech Recognition with Improved Numerical Stability and Advanced Frontend.” In ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2021. https://doi.org/10.1109/icassp39728.2021.9414464. ieee: 'W. Zhang et al., “End-to-End Dereverberation, Beamforming, and Speech Recognition with Improved Numerical Stability and Advanced Frontend,” 2021, doi: 10.1109/icassp39728.2021.9414464.' mla: Zhang, Wangyou, et al. 
“End-to-End Dereverberation, Beamforming, and Speech Recognition with Improved Numerical Stability and Advanced Frontend.” ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2021, doi:10.1109/icassp39728.2021.9414464. short: 'W. Zhang, C. Boeddeker, S. Watanabe, T. Nakatani, M. Delcroix, K. Kinoshita, T. Ochiai, N. Kamo, R. Haeb-Umbach, Y. Qian, in: ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2021.' date_created: 2021-12-03T11:31:42Z date_updated: 2022-01-13T08:31:27Z department: - _id: '54' doi: 10.1109/icassp39728.2021.9414464 language: - iso: eng publication: ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) publication_status: published status: public title: End-to-End Dereverberation, Beamforming, and Speech Recognition with Improved Numerical Stability and Advanced Frontend type: conference user_id: '40767' year: '2021' ... --- _id: '28262' author: - first_name: Chenda full_name: Li, Chenda last_name: Li - first_name: Jing full_name: Shi, Jing last_name: Shi - first_name: Wangyou full_name: Zhang, Wangyou last_name: Zhang - first_name: Aswin Shanmugam full_name: Subramanian, Aswin Shanmugam last_name: Subramanian - first_name: Xuankai full_name: Chang, Xuankai last_name: Chang - first_name: Naoyuki full_name: Kamo, Naoyuki last_name: Kamo - first_name: Moto full_name: Hira, Moto last_name: Hira - first_name: Tomoki full_name: Hayashi, Tomoki last_name: Hayashi - first_name: Christoph full_name: Boeddeker, Christoph id: '40767' last_name: Boeddeker - first_name: Zhuo full_name: Chen, Zhuo last_name: Chen - first_name: Shinji full_name: Watanabe, Shinji last_name: Watanabe citation: ama: 'Li C, Shi J, Zhang W, et al. ESPnet-SE: End-To-End Speech Enhancement and Separation Toolkit Designed for ASR Integration. In: 2021 IEEE Spoken Language Technology Workshop (SLT). ; 2021. 
doi:10.1109/slt48900.2021.9383615' apa: 'Li, C., Shi, J., Zhang, W., Subramanian, A. S., Chang, X., Kamo, N., Hira, M., Hayashi, T., Boeddeker, C., Chen, Z., & Watanabe, S. (2021). ESPnet-SE: End-To-End Speech Enhancement and Separation Toolkit Designed for ASR Integration. 2021 IEEE Spoken Language Technology Workshop (SLT). https://doi.org/10.1109/slt48900.2021.9383615' bibtex: '@inproceedings{Li_Shi_Zhang_Subramanian_Chang_Kamo_Hira_Hayashi_Boeddeker_Chen_et al._2021, title={ESPnet-SE: End-To-End Speech Enhancement and Separation Toolkit Designed for ASR Integration}, DOI={10.1109/slt48900.2021.9383615}, booktitle={2021 IEEE Spoken Language Technology Workshop (SLT)}, author={Li, Chenda and Shi, Jing and Zhang, Wangyou and Subramanian, Aswin Shanmugam and Chang, Xuankai and Kamo, Naoyuki and Hira, Moto and Hayashi, Tomoki and Boeddeker, Christoph and Chen, Zhuo and et al.}, year={2021} }' chicago: 'Li, Chenda, Jing Shi, Wangyou Zhang, Aswin Shanmugam Subramanian, Xuankai Chang, Naoyuki Kamo, Moto Hira, et al. “ESPnet-SE: End-To-End Speech Enhancement and Separation Toolkit Designed for ASR Integration.” In 2021 IEEE Spoken Language Technology Workshop (SLT), 2021. https://doi.org/10.1109/slt48900.2021.9383615.' ieee: 'C. Li et al., “ESPnet-SE: End-To-End Speech Enhancement and Separation Toolkit Designed for ASR Integration,” 2021, doi: 10.1109/slt48900.2021.9383615.' mla: 'Li, Chenda, et al. “ESPnet-SE: End-To-End Speech Enhancement and Separation Toolkit Designed for ASR Integration.” 2021 IEEE Spoken Language Technology Workshop (SLT), 2021, doi:10.1109/slt48900.2021.9383615.' short: 'C. Li, J. Shi, W. Zhang, A.S. Subramanian, X. Chang, N. Kamo, M. Hira, T. Hayashi, C. Boeddeker, Z. Chen, S. Watanabe, in: 2021 IEEE Spoken Language Technology Workshop (SLT), 2021.' 
date_created: 2021-12-03T12:07:35Z date_updated: 2022-01-13T08:34:25Z department: - _id: '54' doi: 10.1109/slt48900.2021.9383615 language: - iso: eng publication: 2021 IEEE Spoken Language Technology Workshop (SLT) publication_status: published status: public title: 'ESPnet-SE: End-To-End Speech Enhancement and Separation Toolkit Designed for ASR Integration' type: conference user_id: '40767' year: '2021' ... --- _id: '28261' author: - first_name: Chenda full_name: Li, Chenda last_name: Li - first_name: Yi full_name: Luo, Yi last_name: Luo - first_name: Cong full_name: Han, Cong last_name: Han - first_name: Jinyu full_name: Li, Jinyu last_name: Li - first_name: Takuya full_name: Yoshioka, Takuya last_name: Yoshioka - first_name: Tianyan full_name: Zhou, Tianyan last_name: Zhou - first_name: Marc full_name: Delcroix, Marc last_name: Delcroix - first_name: Keisuke full_name: Kinoshita, Keisuke last_name: Kinoshita - first_name: Christoph full_name: Boeddeker, Christoph id: '40767' last_name: Boeddeker - first_name: Yanmin full_name: Qian, Yanmin last_name: Qian - first_name: Shinji full_name: Watanabe, Shinji last_name: Watanabe - first_name: Zhuo full_name: Chen, Zhuo last_name: Chen citation: ama: 'Li C, Luo Y, Han C, et al. Dual-Path RNN for Long Recording Speech Separation. In: 2021 IEEE Spoken Language Technology Workshop (SLT). ; 2021. doi:10.1109/slt48900.2021.9383514' apa: Li, C., Luo, Y., Han, C., Li, J., Yoshioka, T., Zhou, T., Delcroix, M., Kinoshita, K., Boeddeker, C., Qian, Y., Watanabe, S., & Chen, Z. (2021). Dual-Path RNN for Long Recording Speech Separation. 2021 IEEE Spoken Language Technology Workshop (SLT). 
https://doi.org/10.1109/slt48900.2021.9383514 bibtex: '@inproceedings{Li_Luo_Han_Li_Yoshioka_Zhou_Delcroix_Kinoshita_Boeddeker_Qian_et al._2021, title={Dual-Path RNN for Long Recording Speech Separation}, DOI={10.1109/slt48900.2021.9383514}, booktitle={2021 IEEE Spoken Language Technology Workshop (SLT)}, author={Li, Chenda and Luo, Yi and Han, Cong and Li, Jinyu and Yoshioka, Takuya and Zhou, Tianyan and Delcroix, Marc and Kinoshita, Keisuke and Boeddeker, Christoph and Qian, Yanmin and et al.}, year={2021} }' chicago: Li, Chenda, Yi Luo, Cong Han, Jinyu Li, Takuya Yoshioka, Tianyan Zhou, Marc Delcroix, et al. “Dual-Path RNN for Long Recording Speech Separation.” In 2021 IEEE Spoken Language Technology Workshop (SLT), 2021. https://doi.org/10.1109/slt48900.2021.9383514. ieee: 'C. Li et al., “Dual-Path RNN for Long Recording Speech Separation,” 2021, doi: 10.1109/slt48900.2021.9383514.' mla: Li, Chenda, et al. “Dual-Path RNN for Long Recording Speech Separation.” 2021 IEEE Spoken Language Technology Workshop (SLT), 2021, doi:10.1109/slt48900.2021.9383514. short: 'C. Li, Y. Luo, C. Han, J. Li, T. Yoshioka, T. Zhou, M. Delcroix, K. Kinoshita, C. Boeddeker, Y. Qian, S. Watanabe, Z. Chen, in: 2021 IEEE Spoken Language Technology Workshop (SLT), 2021.' date_created: 2021-12-03T12:07:03Z date_updated: 2022-01-13T08:34:07Z department: - _id: '54' doi: 10.1109/slt48900.2021.9383514 language: - iso: eng publication: 2021 IEEE Spoken Language Technology Workshop (SLT) publication_status: published status: public title: Dual-Path RNN for Long Recording Speech Separation type: conference user_id: '40767' year: '2021' ... --- _id: '44843' abstract: - lang: eng text: "Unsupervised blind source separation methods do not require a training phase\r\nand thus cannot suffer from a train-test mismatch, which is a common concern in\r\nneural network based source separation. 
The unsupervised techniques can be\r\ncategorized in two classes, those building upon the sparsity of speech in the\r\nShort-Time Fourier transform domain and those exploiting non-Gaussianity or\r\nnon-stationarity of the source signals. In this contribution, spatial mixture\r\nmodels which fall in the first category and independent vector analysis (IVA)\r\nas a representative of the second category are compared w.r.t. their separation\r\nperformance and the performance of a downstream speech recognizer on a\r\nreverberant dataset of reasonable size. Furthermore, we introduce a serial\r\nconcatenation of the two, where the result of the mixture model serves as\r\ninitialization of IVA, which achieves significantly better WER performance than\r\neach algorithm individually and even approaches the performance of a much more\r\ncomplex neural network based technique." author: - first_name: Christoph full_name: Boeddeker, Christoph id: '40767' last_name: Boeddeker - first_name: Frederik full_name: Rautenberg, Frederik id: '72602' last_name: Rautenberg - first_name: Reinhold full_name: Haeb-Umbach, Reinhold id: '242' last_name: Haeb-Umbach citation: ama: 'Boeddeker C, Rautenberg F, Haeb-Umbach R. A Comparison and Combination of Unsupervised Blind Source Separation Techniques. In: ITG Conference on Speech Communication. ; 2021.' apa: Boeddeker, C., Rautenberg, F., & Haeb-Umbach, R. (2021). A Comparison and Combination of Unsupervised Blind Source Separation Techniques. ITG Conference on Speech Communication. ITG Conference on Speech Communication, Kiel. bibtex: '@inproceedings{Boeddeker_Rautenberg_Haeb-Umbach_2021, title={A Comparison and Combination of Unsupervised Blind Source Separation Techniques}, booktitle={ITG Conference on Speech Communication}, author={Boeddeker, Christoph and Rautenberg, Frederik and Haeb-Umbach, Reinhold}, year={2021} }' chicago: Boeddeker, Christoph, Frederik Rautenberg, and Reinhold Haeb-Umbach. 
“A Comparison and Combination of Unsupervised Blind Source Separation Techniques.” In ITG Conference on Speech Communication, 2021. ieee: C. Boeddeker, F. Rautenberg, and R. Haeb-Umbach, “A Comparison and Combination of Unsupervised Blind Source Separation Techniques,” presented at the ITG Conference on Speech Communication, Kiel, 2021. mla: Boeddeker, Christoph, et al. “A Comparison and Combination of Unsupervised Blind Source Separation Techniques.” ITG Conference on Speech Communication, 2021. short: 'C. Boeddeker, F. Rautenberg, R. Haeb-Umbach, in: ITG Conference on Speech Communication, 2021.' conference: location: Kiel name: ITG Conference on Speech Communication date_created: 2023-05-15T07:59:33Z date_updated: 2023-11-15T15:29:32Z ddc: - '000' department: - _id: '54' external_id: arxiv: - '2106.05627' file: - access_level: open_access content_type: application/pdf creator: frra date_created: 2023-05-16T08:37:31Z date_updated: 2023-11-15T15:29:32Z file_id: '44856' file_name: 2106.05627.pdf file_size: 295972 relation: main_file file_date_updated: 2023-11-15T15:29:32Z has_accepted_license: '1' language: - iso: eng main_file_link: - open_access: '1' url: https://arxiv.org/pdf/2106.05627.pdf oa: '1' publication: ITG Conference on Speech Communication status: public title: A Comparison and Combination of Unsupervised Blind Source Separation Techniques type: conference user_id: '40767' year: '2021' ... 
--- _id: '28259' author: - first_name: Christoph full_name: Boeddeker, Christoph id: '40767' last_name: Boeddeker - first_name: Wangyou full_name: Zhang, Wangyou last_name: Zhang - first_name: Tomohiro full_name: Nakatani, Tomohiro last_name: Nakatani - first_name: Keisuke full_name: Kinoshita, Keisuke last_name: Kinoshita - first_name: Tsubasa full_name: Ochiai, Tsubasa last_name: Ochiai - first_name: Marc full_name: Delcroix, Marc last_name: Delcroix - first_name: Naoyuki full_name: Kamo, Naoyuki last_name: Kamo - first_name: Yanmin full_name: Qian, Yanmin last_name: Qian - first_name: Reinhold full_name: Haeb-Umbach, Reinhold id: '242' last_name: Haeb-Umbach citation: ama: 'Boeddeker C, Zhang W, Nakatani T, et al. Convolutive Transfer Function Invariant SDR Training Criteria for Multi-Channel Reverberant Speech Separation. In: ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). ; 2021. doi:10.1109/icassp39728.2021.9414661' apa: Boeddeker, C., Zhang, W., Nakatani, T., Kinoshita, K., Ochiai, T., Delcroix, M., Kamo, N., Qian, Y., & Haeb-Umbach, R. (2021). Convolutive Transfer Function Invariant SDR Training Criteria for Multi-Channel Reverberant Speech Separation. ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 
https://doi.org/10.1109/icassp39728.2021.9414661 bibtex: '@inproceedings{Boeddeker_Zhang_Nakatani_Kinoshita_Ochiai_Delcroix_Kamo_Qian_Haeb-Umbach_2021, title={Convolutive Transfer Function Invariant SDR Training Criteria for Multi-Channel Reverberant Speech Separation}, DOI={10.1109/icassp39728.2021.9414661}, booktitle={ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, author={Boeddeker, Christoph and Zhang, Wangyou and Nakatani, Tomohiro and Kinoshita, Keisuke and Ochiai, Tsubasa and Delcroix, Marc and Kamo, Naoyuki and Qian, Yanmin and Haeb-Umbach, Reinhold}, year={2021} }' chicago: Boeddeker, Christoph, Wangyou Zhang, Tomohiro Nakatani, Keisuke Kinoshita, Tsubasa Ochiai, Marc Delcroix, Naoyuki Kamo, Yanmin Qian, and Reinhold Haeb-Umbach. “Convolutive Transfer Function Invariant SDR Training Criteria for Multi-Channel Reverberant Speech Separation.” In ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2021. https://doi.org/10.1109/icassp39728.2021.9414661. ieee: 'C. Boeddeker et al., “Convolutive Transfer Function Invariant SDR Training Criteria for Multi-Channel Reverberant Speech Separation,” 2021, doi: 10.1109/icassp39728.2021.9414661.' mla: Boeddeker, Christoph, et al. “Convolutive Transfer Function Invariant SDR Training Criteria for Multi-Channel Reverberant Speech Separation.” ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2021, doi:10.1109/icassp39728.2021.9414661. short: 'C. Boeddeker, W. Zhang, T. Nakatani, K. Kinoshita, T. Ochiai, M. Delcroix, N. Kamo, Y. Qian, R. Haeb-Umbach, in: ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2021.' 
date_created: 2021-12-03T12:00:16Z date_updated: 2023-11-15T15:18:09Z ddc: - '000' department: - _id: '54' doi: 10.1109/icassp39728.2021.9414661 file: - access_level: open_access content_type: application/pdf creator: cbj date_created: 2021-12-03T12:01:20Z date_updated: 2023-11-15T15:18:08Z file_id: '28260' file_name: ICASSP2021_BSSEval.pdf file_size: 228717 relation: main_file file_date_updated: 2023-11-15T15:18:08Z has_accepted_license: '1' language: - iso: eng oa: '1' project: - _id: '52' name: 'PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing' publication: ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) publication_status: published status: public title: Convolutive Transfer Function Invariant SDR Training Criteria for Multi-Channel Reverberant Speech Separation type: conference user_id: '40767' year: '2021' ...