@article{52958, author = {{Boeddeker, Christoph and Subramanian, Aswin Shanmugam and Wichern, Gordon and Haeb-Umbach, Reinhold and Le Roux, Jonathan}}, issn = {{2329-9290}}, journal = {{IEEE/ACM Transactions on Audio, Speech, and Language Processing}}, keywords = {{Electrical and Electronic Engineering, Acoustics and Ultrasonics, Computer Science (miscellaneous), Computational Mathematics}}, pages = {{1185--1197}}, publisher = {{Institute of Electrical and Electronics Engineers (IEEE)}}, title = {{{TS-SEP: Joint Diarization and Separation Conditioned on Estimated Speaker Embeddings}}}, doi = {{10.1109/taslp.2024.3350887}}, volume = {{32}}, year = {{2024}}, } @inproceedings{48269, author = {{Gburrek, Tobias and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold}}, booktitle = {{European Signal Processing Conference (EUSIPCO)}}, location = {{Helsinki}}, title = {{{On the Integration of Sampling Rate Synchronization and Acoustic Beamforming}}}, year = {{2023}}, } @inproceedings{47128, author = {{Cord-Landwehr, Tobias and Boeddeker, Christoph and Zorilă, Cătălin and Doddipatla, Rama and Haeb-Umbach, Reinhold}}, booktitle = {{ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}}, location = {{Rhodes}}, publisher = {{IEEE}}, title = {{{Frame-Wise and Overlap-Robust Speaker Embeddings for Meeting Diarization}}}, doi = {{10.1109/icassp49357.2023.10095370}}, year = {{2023}}, } @inproceedings{48270, author = {{Schmalenstroeer, Joerg and Gburrek, Tobias and Haeb-Umbach, Reinhold}}, booktitle = {{ITG Conference on Speech Communication}}, location = {{Aachen}}, title = {{{LibriWASN: A Data Set for Meeting Separation, Diarization, and Recognition with Asynchronous Recording Devices}}}, year = {{2023}}, } @inproceedings{47129, author = {{Cord-Landwehr, Tobias and Boeddeker, Christoph and Zorilă, Cătălin and Doddipatla, Rama and Haeb-Umbach, Reinhold}}, booktitle = {{INTERSPEECH 2023}}, publisher = {{ISCA}}, title = {{{A Teacher-Student Approach for Extracting Informative Speaker Embeddings From Speech Mixtures}}}, doi = {{10.21437/interspeech.2023-1379}}, year = {{2023}}, } @inproceedings{48355, abstract = {{Unsupervised speech disentanglement aims at separating fast varying from slowly varying components of a speech signal. In this contribution, we take a closer look at the embedding vector representing the slowly varying signal components, commonly named the speaker embedding vector. We ask which properties of a speaker's voice are captured and investigate to what extent individual embedding vector components are responsible for them, using the concept of Shapley values.
Our findings show that certain speaker-specific acoustic-phonetic properties can be fairly well predicted from the speaker embedding, while the more abstract voice quality features that were investigated cannot.}}, author = {{Rautenberg, Frederik and Kuhlmann, Michael and Wiechmann, Jana and Seebauer, Fritz and Wagner, Petra and Haeb-Umbach, Reinhold}}, booktitle = {{ITG Conference on Speech Communication}}, location = {{Aachen}}, title = {{{On Feature Importance and Interpretability of Speaker Representations}}}, year = {{2023}}, } @inproceedings{48410, author = {{Wiechmann, Jana and Rautenberg, Frederik and Wagner, Petra and Haeb-Umbach, Reinhold}}, booktitle = {{20th International Congress of the Phonetic Sciences (ICPhS)}}, title = {{{Explaining voice characteristics to novice voice practitioners - How successful is it?}}}, year = {{2023}}, } @inproceedings{48390, author = {{Berger, Simon and Vieting, Peter and Boeddeker, Christoph and Schlüter, Ralf and Haeb-Umbach, Reinhold}}, booktitle = {{INTERSPEECH 2023}}, publisher = {{ISCA}}, title = {{{Mixture Encoder for Joint Speech Separation and Recognition}}}, doi = {{10.21437/interspeech.2023-1815}}, year = {{2023}}, } @inproceedings{46069, author = {{Seebauer, Fritz and Kuhlmann, Michael and Haeb-Umbach, Reinhold and Wagner, Petra}}, booktitle = {{12th Speech Synthesis Workshop (SSW) 2023}}, title = {{{Re-examining the quality dimensions of synthetic speech}}}, year = {{2023}}, } @article{35602, abstract = {{Continuous Speech Separation (CSS) has been proposed to address speech overlaps during the analysis of realistic meeting-like conversations by eliminating any overlaps before further processing. CSS separates a recording of arbitrarily many speakers into a small number of overlap-free output channels, where each output channel may contain speech of multiple speakers. This is often done by applying a conventional separation model trained with Utterance-level Permutation Invariant Training (uPIT), which exclusively maps a speaker to an output channel, in a sliding window approach called stitching. Recently, we introduced an alternative training scheme called Graph-PIT that teaches the separation network to directly produce output streams in the required format without stitching. It can handle an arbitrary number of speakers as long as no more of them overlap at the same time than the separator has output channels. In this contribution, we further investigate the Graph-PIT training scheme. We show in extended experiments that models trained with Graph-PIT also work in challenging reverberant conditions. Models trained in this way are able to perform segment-less CSS, i.e., without stitching, and achieve separation quality that is comparable to, and often better than, conventional CSS with uPIT and stitching. We simplify the training schedule for Graph-PIT with the recently proposed Source Aggregated Signal-to-Distortion Ratio (SA-SDR) loss. It eliminates unfavorable properties of the previously used A-SDR loss and thus enables training with Graph-PIT from scratch. Graph-PIT training relaxes the constraints w.r.t. the allowed numbers of speakers and speaking patterns, which allows the use of a larger variety of training data.
Furthermore, we introduce novel signal-level evaluation metrics for meeting scenarios, namely the source-aggregated scale- and convolution-invariant Signal-to-Distortion Ratio (SA-SI-SDR and SA-CI-SDR), which are generalizations of the commonly used SDR-based metrics for the CSS case.}}, author = {{von Neumann, Thilo and Kinoshita, Keisuke and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}}, issn = {{2329-9290}}, journal = {{IEEE/ACM Transactions on Audio, Speech, and Language Processing}}, keywords = {{Continuous Speech Separation, Source Separation, Graph-PIT, Dynamic Programming, Permutation Invariant Training}}, pages = {{576--589}}, publisher = {{Institute of Electrical and Electronics Engineers (IEEE)}}, title = {{{Segment-Less Continuous Speech Separation of Meetings: Training and Evaluation Criteria}}}, doi = {{10.1109/taslp.2022.3228629}}, volume = {{31}}, year = {{2023}}, } @inproceedings{48281, abstract = {{We propose a general framework to compute the word error rate (WER) of ASR systems that process recordings containing multiple speakers at their input and that produce multiple output word sequences (MIMO). Such ASR systems are typically required, e.g., for meeting transcription. We provide an efficient implementation based on a dynamic programming search in a multi-dimensional Levenshtein distance tensor under the constraint that a reference utterance must be matched consistently with one hypothesis output. This also results in an efficient implementation of the ORC WER, which previously suffered from exponential complexity. We give an overview of commonly used WER definitions for multi-speaker scenarios and show that they are specializations of the above MIMO WER tuned to particular application scenarios. We conclude with a discussion of the pros and cons of the various WER definitions and a recommendation on when to use which.}}, author = {{von Neumann, Thilo and Boeddeker, Christoph and Kinoshita, Keisuke and Delcroix, Marc and Haeb-Umbach, Reinhold}}, booktitle = {{ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}}, keywords = {{Word Error Rate, Meeting Recognition, Levenshtein Distance}}, publisher = {{IEEE}}, title = {{{On Word Error Rate Definitions and Their Efficient Computation for Multi-Speaker Speech Recognition Systems}}}, doi = {{10.1109/icassp49357.2023.10094784}}, year = {{2023}}, } @inproceedings{48275, abstract = {{MeetEval is an open-source toolkit to evaluate all kinds of meeting transcription systems. It provides a unified interface for the computation of commonly used Word Error Rates (WERs), specifically cpWER, ORC WER, and MIMO WER, along with other WER definitions. We extend the cpWER computation with a temporal constraint to ensure that words are only identified as correct when the temporal alignment is plausible. This improves the quality of the matching of the hypothesis string to the reference string, so that it more closely resembles the actual transcription quality, and a system is penalized if it provides poor time annotations. Since word-level timing information is often not available, we present a way to approximate exact word-level timings from segment-level timings (e.g., of a sentence) and show that the approximation leads to a WER similar to that of a matching with exact word-level annotations.
At the same time, the time constraint leads to a speedup of the matching algorithm, which outweighs the additional overhead caused by processing the time stamps.}}, author = {{von Neumann, Thilo and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}}, booktitle = {{Proc. CHiME 2023 Workshop on Speech Processing in Everyday Environments}}, keywords = {{Speech Recognition, Word Error Rate, Meeting Transcription}}, location = {{Dublin}}, title = {{{MeetEval: A Toolkit for Computation of Word Error Rates for Meeting Transcription Systems}}}, year = {{2023}}, } @inproceedings{49109, abstract = {{We propose a diarization system that estimates “who spoke when” based on spatial information, to be used as a front-end of a meeting transcription system running on the signals gathered from an acoustic sensor network (ASN). Although the spatial distribution of the microphones is advantageous, exploiting the spatial diversity for diarization and signal enhancement is challenging because the microphones’ positions are typically unknown and the recorded signals are, in general, initially unsynchronized. Here, we approach these issues by first blindly synchronizing the signals and then estimating time differences of arrival (TDOAs). The TDOA information is exploited to estimate the speakers’ activity, even in the presence of multiple simultaneously active speakers. This speaker activity information serves as a guide for a spatial mixture model, on the basis of which the individual speakers’ signals are extracted via beamforming. Finally, the extracted signals are forwarded to a speech recognizer. Additionally, a novel initialization scheme for spatial mixture models based on the TDOA estimates is proposed. Experiments conducted on real recordings from the LibriWASN data set have shown that our proposed system is advantageous compared to a system using a spatial mixture model without external diarization information.}}, author = {{Gburrek, Tobias and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold}}, booktitle = {{Proc. Asilomar Conference on Signals, Systems, and Computers}}, keywords = {{Diarization, time difference of arrival, ad-hoc acoustic sensor network, meeting transcription}}, title = {{{Spatial Diarization for Meeting Transcription with Ad-Hoc Acoustic Sensor Networks}}}, year = {{2023}}, } @inproceedings{44849, author = {{Rautenberg, Frederik and Kuhlmann, Michael and Ebbers, Janek and Wiechmann, Jana and Seebauer, Fritz and Wagner, Petra and Haeb-Umbach, Reinhold}}, booktitle = {{Fortschritte der Akustik - DAGA 2023}}, location = {{Hamburg}}, pages = {{1409--1412}}, title = {{{Speech Disentanglement for Analysis and Modification of Acoustic and Perceptual Speaker Characteristics}}}, year = {{2023}}, } @inproceedings{33954, author = {{Boeddeker, Christoph and Cord-Landwehr, Tobias and von Neumann, Thilo and Haeb-Umbach, Reinhold}}, booktitle = {{Interspeech 2022}}, publisher = {{ISCA}}, title = {{{An Initialization Scheme for Meeting Separation with Spatial Mixture Models}}}, doi = {{10.21437/interspeech.2022-10929}}, year = {{2022}}, } @inproceedings{33471, abstract = {{The intelligibility of demodulated audio signals from analog high frequency transmissions, e.g., using single-sideband (SSB) modulation, can be severely degraded by channel distortions and/or a mismatch between the modulation and demodulation carrier frequency.
In this work, a neural network (NN)-based approach for carrier frequency offset (CFO) estimation from demodulated SSB signals is proposed, together with a task-specific architecture. Additionally, a simulation framework for SSB signals is introduced and utilized for training the NNs. The CFO estimator is combined with a speech enhancement network to investigate its influence on the enhancement performance. The NN-based system is compared to a recently proposed pitch-tracking-based approach on publicly available data from real high frequency transmissions. Experiments show that the NN exhibits good CFO estimation properties and results in significant improvements in speech intelligibility, especially when combined with a noise reduction network.}}, author = {{Heitkämper, Jens and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold}}, booktitle = {{Proceedings of the 30th European Signal Processing Conference (EUSIPCO)}}, location = {{Belgrade}}, title = {{{Neural Network Based Carrier Frequency Offset Estimation From Speech Transmitted Over High Frequency Channels}}}, year = {{2022}}, } @inproceedings{33958, abstract = {{Recent speaker diarization studies have shown that the integration of end-to-end neural diarization (EEND) and clustering-based diarization is a promising approach for achieving state-of-the-art performance on various tasks. Such an approach first divides an observed signal into fixed-length segments, then performs {\it segment-level} local diarization based on an EEND module, and merges the segment-level results via clustering to form a final global diarization result. The segmentation is done to limit the number of speakers in each segment, since the current EEND cannot handle a large number of speakers. In this paper, we argue that such an approach involving segmentation has several issues; for example, it inevitably faces the dilemma that larger segment sizes increase both the context available for enhancing the performance and the number of speakers that the local EEND module has to handle. To resolve this problem, this paper proposes a novel framework that performs diarization without segmentation, while still handling challenging data containing many speakers and a significant amount of overlapping speech. The proposed method can take an entire meeting for inference and perform {\it utterance-by-utterance} diarization that clusters utterance activities in terms of speakers. To this end, we leverage a neural network training scheme called Graph-PIT, recently proposed for neural source separation. Experiments with simulated active-meeting-like data and CALLHOME data show the superiority of the proposed approach over the conventional methods.}}, author = {{Kinoshita, Keisuke and von Neumann, Thilo and Delcroix, Marc and Boeddeker, Christoph and Haeb-Umbach, Reinhold}}, booktitle = {{Proc.
Interspeech 2022}}, pages = {{1486--1490}}, publisher = {{ISCA}}, title = {{{Utterance-by-utterance overlap-aware neural diarization with Graph-PIT}}}, doi = {{10.21437/Interspeech.2022-11408}}, year = {{2022}}, } @inproceedings{33819, author = {{von Neumann, Thilo and Kinoshita, Keisuke and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}}, booktitle = {{ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}}, publisher = {{IEEE}}, title = {{{SA-SDR: A Novel Loss Function for Separation of Meeting Style Data}}}, doi = {{10.1109/icassp43922.2022.9746757}}, year = {{2022}}, } @inproceedings{33847, abstract = {{The scope of speech enhancement has changed from a monolithic view of single, independent tasks to the joint processing of complex conversational speech recordings. Training and evaluation of these single tasks require synthetic data with access to intermediate signals that is as close as possible to the evaluation scenario. As such data is often not available, many works instead use specialized databases for the training of each system component, e.g., WSJ0-mix for source separation. We present a Multi-purpose Multi-Speaker Mixture Signal Generator (MMS-MSG) for generating a variety of speech mixture signals based on any speech corpus, ranging from classical anechoic mixtures (e.g., WSJ0-mix) through reverberant mixtures (e.g., SMS-WSJ) to meeting-style data. Its highly modular and flexible structure allows for the simulation of diverse environments and dynamic mixing, while simultaneously enabling an easy extension and modification to generate new scenarios and mixture types. These meetings can be used for prototyping, evaluation, or training purposes. We provide example evaluation data and baseline results for meetings based on the WSJ corpus. Further, we demonstrate its usefulness for realistic scenarios by using MMS-MSG to provide training data for the LibriCSS database.}}, author = {{Cord-Landwehr, Tobias and von Neumann, Thilo and Boeddeker, Christoph and Haeb-Umbach, Reinhold}}, booktitle = {{2022 International Workshop on Acoustic Signal Enhancement (IWAENC)}}, location = {{Bamberg}}, title = {{{MMS-MSG: A Multi-purpose Multi-Speaker Mixture Signal Generator}}}, year = {{2022}}, } @inproceedings{33848, abstract = {{Impressive progress in neural network-based single-channel speech source separation has been made in recent years. However, those improvements have mostly been reported on anechoic data, a situation that is hardly met in practice. Taking as a starting point the SepFormer, which achieves state-of-the-art performance on anechoic mixtures, we gradually modify it to optimize its performance on reverberant mixtures. Although this leads to a word error rate improvement of 7 percentage points compared to the standard SepFormer implementation, the system ends up with only marginally better performance than a PIT-BLSTM separation system that is optimized with rather straightforward means.
This is surprising and at the same time sobering, challenging the practical usefulness of many improvements reported in recent years for monaural source separation on nonreverberant data.}}, author = {{Cord-Landwehr, Tobias and Boeddeker, Christoph and von Neumann, Thilo and Zorila, Catalin and Doddipatla, Rama and Haeb-Umbach, Reinhold}}, booktitle = {{2022 International Workshop on Acoustic Signal Enhancement (IWAENC)}}, publisher = {{IEEE}}, title = {{{Monaural source separation: From anechoic to reverberant environments}}}, year = {{2022}}, }