@article{52958, author = {{Boeddeker, Christoph and Subramanian, Aswin Shanmugam and Wichern, Gordon and Haeb-Umbach, Reinhold and Le Roux, Jonathan}}, issn = {{2329-9290}}, journal = {{IEEE/ACM Transactions on Audio, Speech, and Language Processing}}, keywords = {{Electrical and Electronic Engineering, Acoustics and Ultrasonics, Computer Science (miscellaneous), Computational Mathematics}}, pages = {{1185--1197}}, publisher = {{Institute of Electrical and Electronics Engineers (IEEE)}}, title = {{{TS-SEP: Joint Diarization and Separation Conditioned on Estimated Speaker Embeddings}}}, doi = {{10.1109/taslp.2024.3350887}}, volume = {{32}}, year = {{2024}}, } @inproceedings{47128, author = {{Cord-Landwehr, Tobias and Boeddeker, Christoph and Zorilă, Cătălin and Doddipatla, Rama and Haeb-Umbach, Reinhold}}, booktitle = {{ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}}, location = {{Rhodes}}, publisher = {{IEEE}}, title = {{{Frame-Wise and Overlap-Robust Speaker Embeddings for Meeting Diarization}}}, doi = {{10.1109/icassp49357.2023.10095370}}, year = {{2023}}, } @inproceedings{47129, author = {{Cord-Landwehr, Tobias and Boeddeker, Christoph and Zorilă, Cătălin and Doddipatla, Rama and Haeb-Umbach, Reinhold}}, booktitle = {{INTERSPEECH 2023}}, publisher = {{ISCA}}, title = {{{A Teacher-Student Approach for Extracting Informative Speaker Embeddings From Speech Mixtures}}}, doi = {{10.21437/interspeech.2023-1379}}, year = {{2023}}, } @inproceedings{48391, author = {{Aralikatti, Rohith and Boeddeker, Christoph and Wichern, Gordon and Subramanian, Aswin and Le Roux, Jonathan}}, booktitle = {{ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}}, publisher = {{IEEE}}, title = {{{Reverberation as Supervision for Speech Separation}}}, doi = {{10.1109/icassp49357.2023.10095022}}, year = {{2023}}, } @inproceedings{48390, author = {{Berger, Simon and Vieting, Peter and Boeddeker, Christoph and Schlüter, Ralf and Haeb-Umbach, Reinhold}}, booktitle = {{INTERSPEECH 2023}}, publisher = {{ISCA}}, title = {{{Mixture Encoder for Joint Speech Separation and Recognition}}}, doi = {{10.21437/interspeech.2023-1815}}, year = {{2023}}, } @article{35602, abstract = {{Continuous Speech Separation (CSS) has been proposed to address speech overlaps during the analysis of realistic meeting-like conversations by eliminating any overlaps before further processing. CSS separates a recording of arbitrarily many speakers into a small number of overlap-free output channels, where each output channel may contain speech of multiple speakers. This is often done by applying a conventional separation model trained with Utterance-level Permutation Invariant Training (uPIT), which exclusively maps a speaker to an output channel, in a sliding window approach called stitching. Recently, we introduced an alternative training scheme called Graph-PIT that teaches the separation network to directly produce output streams in the required format without stitching. It can handle an arbitrary number of speakers as long as no more of them overlap at the same time than the separator has output channels. In this contribution, we further investigate the Graph-PIT training scheme. We show in extended experiments that models trained with Graph-PIT also work in challenging reverberant conditions.
Models trained in this way are able to perform segment-less CSS, i.e., without stitching, and achieve comparable and often better separation quality than the conventional CSS with uPIT and stitching. We simplify the training schedule for Graph-PIT with the recently proposed Source Aggregated Signal-to-Distortion Ratio (SA-SDR) loss. It eliminates unfavorable properties of the previously used A-SDR loss and thus enables training with Graph-PIT from scratch. Graph-PIT training relaxes the constraints w.r.t. the allowed numbers of speakers and speaking patterns, which allows using a larger variety of training data. Furthermore, we introduce novel signal-level evaluation metrics for meeting scenarios, namely the source-aggregated scale- and convolution-invariant Signal-to-Distortion Ratio (SA-SI-SDR and SA-CI-SDR), which are generalizations of the commonly used SDR-based metrics for the CSS case.}}, author = {{von Neumann, Thilo and Kinoshita, Keisuke and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}}, issn = {{2329-9290}}, journal = {{IEEE/ACM Transactions on Audio, Speech, and Language Processing}}, keywords = {{Continuous Speech Separation, Source Separation, Graph-PIT, Dynamic Programming, Permutation Invariant Training}}, pages = {{576--589}}, publisher = {{Institute of Electrical and Electronics Engineers (IEEE)}}, title = {{{Segment-Less Continuous Speech Separation of Meetings: Training and Evaluation Criteria}}}, doi = {{10.1109/taslp.2022.3228629}}, volume = {{31}}, year = {{2023}}, } @inproceedings{48281, abstract = {{We propose a general framework to compute the word error rate (WER) of ASR systems that process recordings containing multiple speakers at their input and that produce multiple output word sequences (MIMO). Such ASR systems are typically required, e.g., for meeting transcription. We provide an efficient implementation based on a dynamic programming search in a multi-dimensional Levenshtein distance tensor under the constraint that a reference utterance must be matched consistently with one hypothesis output. This also results in an efficient implementation of the ORC WER, which previously suffered from exponential complexity. We give an overview of commonly used WER definitions for multi-speaker scenarios and show that they are specializations of the above MIMO WER tuned to particular application scenarios. We conclude with a discussion of the pros and cons of the various WER definitions and a recommendation on when to use which.}}, author = {{von Neumann, Thilo and Boeddeker, Christoph and Kinoshita, Keisuke and Delcroix, Marc and Haeb-Umbach, Reinhold}}, booktitle = {{ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}}, keywords = {{Word Error Rate, Meeting Recognition, Levenshtein Distance}}, publisher = {{IEEE}}, title = {{{On Word Error Rate Definitions and Their Efficient Computation for Multi-Speaker Speech Recognition Systems}}}, doi = {{10.1109/icassp49357.2023.10094784}}, year = {{2023}}, } @inproceedings{48275, abstract = {{MeetEval is an open-source toolkit to evaluate all kinds of meeting transcription systems. It provides a unified interface for the computation of commonly used Word Error Rates (WERs), specifically cpWER, ORC WER and MIMO WER, along with other WER definitions. We extend the cpWER computation by a temporal constraint to ensure that only words are identified as correct when the temporal alignment is plausible.
This leads to a matching between hypothesis and reference strings that more closely resembles the actual transcription quality, and a system is penalized if it provides poor time annotations. Since word-level timing information is often not available, we present a way to approximate exact word-level timings from segment-level timings (e.g., a sentence) and show that the approximation leads to a similar WER to a matching with exact word-level annotations. At the same time, the time constraint leads to a speedup of the matching algorithm, which outweighs the additional overhead caused by processing the time stamps.}}, author = {{von Neumann, Thilo and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}}, booktitle = {{Proc. CHiME 2023 Workshop on Speech Processing in Everyday Environments}}, keywords = {{Speech Recognition, Word Error Rate, Meeting Transcription}}, location = {{Dublin}}, title = {{{MeetEval: A Toolkit for Computation of Word Error Rates for Meeting Transcription Systems}}}, year = {{2023}}, } @article{33669, abstract = {{Far-field multi-speaker automatic speech recognition (ASR) has drawn increasing attention in recent years. Most existing methods feature a signal processing frontend and an ASR backend. In realistic scenarios, these modules are usually trained separately or progressively, which suffers from either inter-module mismatch or a complicated training process. In this paper, we propose an end-to-end multi-channel model that jointly optimizes the speech enhancement (including speech dereverberation, denoising, and separation) frontend and the ASR backend as a single system. To the best of our knowledge, this is the first work that proposes to optimize dereverberation, beamforming, and multi-speaker ASR in a fully end-to-end manner. The frontend module consists of a weighted prediction error (WPE) based submodule for dereverberation and a neural beamformer for denoising and speech separation. For the backend, we adopt a widely used end-to-end (E2E) ASR architecture. It is worth noting that the entire model is differentiable and can be optimized in a fully end-to-end manner using only the ASR criterion, without the need for parallel signal-level labels.
We evaluate the proposed model on several multi-speaker benchmark datasets, and experimental results show that the fully E2E ASR model can achieve competitive performance in both noisy and reverberant conditions, with over 30% relative word error rate (WER) reduction over the single-channel baseline systems.}}, author = {{Zhang, Wangyou and Chang, Xuankai and Boeddeker, Christoph and Nakatani, Tomohiro and Watanabe, Shinji and Qian, Yanmin}}, issn = {{2329-9290}}, journal = {{IEEE/ACM Transactions on Audio, Speech, and Language Processing}}, title = {{{End-to-End Dereverberation, Beamforming, and Speech Recognition in a Cocktail Party}}}, doi = {{10.1109/TASLP.2022.3209942}}, year = {{2022}}, } @inproceedings{33954, author = {{Boeddeker, Christoph and Cord-Landwehr, Tobias and von Neumann, Thilo and Haeb-Umbach, Reinhold}}, booktitle = {{Interspeech 2022}}, publisher = {{ISCA}}, title = {{{An Initialization Scheme for Meeting Separation with Spatial Mixture Models}}}, doi = {{10.21437/interspeech.2022-10929}}, year = {{2022}}, } @inproceedings{33958, abstract = {{Recent speaker diarization studies showed that integration of end-to-end neural diarization (EEND) and clustering-based diarization is a promising approach for achieving state-of-the-art performance on various tasks. Such an approach first divides an observed signal into fixed-length segments, then performs {\it segment-level} local diarization based on an EEND module, and merges the segment-level results via clustering to form a final global diarization result. The segmentation is done to limit the number of speakers in each segment since the current EEND cannot handle a large number of speakers. In this paper, we argue that such an approach involving the segmentation has several issues; for example, it inevitably faces a dilemma that larger segment sizes increase both the context available for enhancing the performance and the number of speakers for the local EEND module to handle. To resolve such a problem, this paper proposes a novel framework that performs diarization without segmentation. However, it can still handle challenging data containing many speakers and a significant amount of overlapping speech. The proposed method can take an entire meeting for inference and perform {\it utterance-by-utterance} diarization that clusters utterance activities in terms of speakers. To this end, we leverage a neural network training scheme called Graph-PIT proposed recently for neural source separation. Experiments with simulated active-meeting-like data and CALLHOME data show the superiority of the proposed approach over the conventional methods.}}, author = {{Kinoshita, Keisuke and von Neumann, Thilo and Delcroix, Marc and Boeddeker, Christoph and Haeb-Umbach, Reinhold}}, booktitle = {{Proc.
Interspeech 2022}}, pages = {{1486--1490}}, publisher = {{ISCA}}, title = {{{Utterance-by-utterance overlap-aware neural diarization with Graph-PIT}}}, doi = {{10.21437/Interspeech.2022-11408}}, year = {{2022}}, } @inproceedings{33819, author = {{von Neumann, Thilo and Kinoshita, Keisuke and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}}, booktitle = {{ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}}, publisher = {{IEEE}}, title = {{{SA-SDR: A Novel Loss Function for Separation of Meeting Style Data}}}, doi = {{10.1109/icassp43922.2022.9746757}}, year = {{2022}}, } @inproceedings{33847, abstract = {{The scope of speech enhancement has changed from a monolithic view of single, independent tasks to a joint processing of complex conversational speech recordings. Training and evaluation of these single tasks require synthetic data with access to intermediate signals that is as close as possible to the evaluation scenario. As such data often is not available, many works instead use specialized databases for the training of each system component, e.g., WSJ0-mix for source separation. We present a Multi-purpose Multi-Speaker Mixture Signal Generator (MMS-MSG) for generating a variety of speech mixture signals based on any speech corpus, ranging from classical anechoic mixtures (e.g., WSJ0-mix) over reverberant mixtures (e.g., SMS-WSJ) to meeting-style data. Its highly modular and flexible structure allows for the simulation of diverse environments and dynamic mixing, while simultaneously enabling an easy extension and modification to generate new scenarios and mixture types. These meetings can be used for prototyping, evaluation, or training purposes. We provide example evaluation data and baseline results for meetings based on the WSJ corpus. Further, we demonstrate the usefulness for realistic scenarios by using MMS-MSG to provide training data for the LibriCSS database.}}, author = {{Cord-Landwehr, Tobias and von Neumann, Thilo and Boeddeker, Christoph and Haeb-Umbach, Reinhold}}, booktitle = {{2022 International Workshop on Acoustic Signal Enhancement (IWAENC)}}, location = {{Bamberg}}, title = {{{MMS-MSG: A Multi-purpose Multi-Speaker Mixture Signal Generator}}}, year = {{2022}}, } @inproceedings{33848, abstract = {{Impressive progress in neural network-based single-channel speech source separation has been made in recent years. But those improvements have been mostly reported on anechoic data, a situation that is hardly met in practice. Taking the SepFormer as a starting point, which achieves state-of-the-art performance on anechoic mixtures, we gradually modify it to optimize its performance on reverberant mixtures. Although this leads to a word error rate improvement of 7 percentage points compared to the standard SepFormer implementation, the system ends up with only marginally better performance than a PIT-BLSTM separation system that is optimized with rather straightforward means.
This is surprising and at the same time sobering, challenging the practical usefulness of many improvements reported in recent years for monaural source separation on nonreverberant data.}}, author = {{Cord-Landwehr, Tobias and Boeddeker, Christoph and von Neumann, Thilo and Zorilă, Cătălin and Doddipatla, Rama and Haeb-Umbach, Reinhold}}, booktitle = {{2022 International Workshop on Acoustic Signal Enhancement (IWAENC)}}, publisher = {{IEEE}}, title = {{{Monaural source separation: From anechoic to reverberant environments}}}, year = {{2022}}, } @misc{33816, author = {{Gburrek, Tobias and Boeddeker, Christoph and von Neumann, Thilo and Cord-Landwehr, Tobias and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold}}, publisher = {{arXiv}}, title = {{{A Meeting Transcription System for an Ad-Hoc Acoustic Sensor Network}}}, doi = {{10.48550/ARXIV.2205.00944}}, year = {{2022}}, } @inproceedings{28256, author = {{Zhang, Wangyou and Boeddeker, Christoph and Watanabe, Shinji and Nakatani, Tomohiro and Delcroix, Marc and Kinoshita, Keisuke and Ochiai, Tsubasa and Kamo, Naoyuki and Haeb-Umbach, Reinhold and Qian, Yanmin}}, booktitle = {{ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}}, title = {{{End-to-End Dereverberation, Beamforming, and Speech Recognition with Improved Numerical Stability and Advanced Frontend}}}, doi = {{10.1109/icassp39728.2021.9414464}}, year = {{2021}}, } @inproceedings{28262, author = {{Li, Chenda and Shi, Jing and Zhang, Wangyou and Subramanian, Aswin Shanmugam and Chang, Xuankai and Kamo, Naoyuki and Hira, Moto and Hayashi, Tomoki and Boeddeker, Christoph and Chen, Zhuo and Watanabe, Shinji}}, booktitle = {{2021 IEEE Spoken Language Technology Workshop (SLT)}}, title = {{{ESPnet-SE: End-To-End Speech Enhancement and Separation Toolkit Designed for ASR Integration}}}, doi = {{10.1109/slt48900.2021.9383615}}, year = {{2021}}, } @inproceedings{28261, author = {{Li, Chenda and Luo, Yi and Han, Cong and Li, Jinyu and Yoshioka, Takuya and Zhou, Tianyan and Delcroix, Marc and Kinoshita, Keisuke and Boeddeker, Christoph and Qian, Yanmin and Watanabe, Shinji and Chen, Zhuo}}, booktitle = {{2021 IEEE Spoken Language Technology Workshop (SLT)}}, title = {{{Dual-Path RNN for Long Recording Speech Separation}}}, doi = {{10.1109/slt48900.2021.9383514}}, year = {{2021}}, } @inproceedings{44843, abstract = {{Unsupervised blind source separation methods do not require a training phase and thus cannot suffer from a train-test mismatch, which is a common concern in neural network based source separation. The unsupervised techniques can be categorized into two classes: those building upon the sparsity of speech in the Short-Time Fourier transform domain and those exploiting non-Gaussianity or non-stationarity of the source signals. In this contribution, spatial mixture models, which fall in the first category, and independent vector analysis (IVA), as a representative of the second category, are compared w.r.t. their separation performance and the performance of a downstream speech recognizer on a reverberant dataset of reasonable size.
Furthermore, we introduce a serial concatenation of the two, where the result of the mixture model serves as the initialization of IVA, which achieves significantly better WER performance than either algorithm individually and even approaches the performance of a much more complex neural network based technique.}}, author = {{Boeddeker, Christoph and Rautenberg, Frederik and Haeb-Umbach, Reinhold}}, booktitle = {{ITG Conference on Speech Communication}}, location = {{Kiel}}, title = {{{A Comparison and Combination of Unsupervised Blind Source Separation Techniques}}}, year = {{2021}}, } @inproceedings{28259, author = {{Boeddeker, Christoph and Zhang, Wangyou and Nakatani, Tomohiro and Kinoshita, Keisuke and Ochiai, Tsubasa and Delcroix, Marc and Kamo, Naoyuki and Qian, Yanmin and Haeb-Umbach, Reinhold}}, booktitle = {{ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}}, title = {{{Convolutive Transfer Function Invariant SDR Training Criteria for Multi-Channel Reverberant Speech Separation}}}, doi = {{10.1109/icassp39728.2021.9414661}}, year = {{2021}}, }