@article{52958, author = {{Boeddeker, Christoph and Subramanian, Aswin Shanmugam and Wichern, Gordon and Haeb-Umbach, Reinhold and Le Roux, Jonathan}}, issn = {{2329-9290}}, journal = {{IEEE/ACM Transactions on Audio, Speech, and Language Processing}}, keywords = {{Electrical and Electronic Engineering, Acoustics and Ultrasonics, Computer Science (miscellaneous), Computational Mathematics}}, pages = {{1185--1197}}, publisher = {{Institute of Electrical and Electronics Engineers (IEEE)}}, title = {{{TS-SEP: Joint Diarization and Separation Conditioned on Estimated Speaker Embeddings}}}, doi = {{10.1109/taslp.2024.3350887}}, volume = {{32}}, year = {{2024}}, } @inproceedings{48269, author = {{Gburrek, Tobias and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold}}, booktitle = {{European Signal Processing Conference (EUSIPCO)}}, location = {{Helsinki}}, title = {{{On the Integration of Sampling Rate Synchronization and Acoustic Beamforming}}}, year = {{2023}}, } @inproceedings{47128, author = {{Cord-Landwehr, Tobias and Boeddeker, Christoph and Zorilă, Cătălin and Doddipatla, Rama and Haeb-Umbach, Reinhold}}, booktitle = {{ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}}, location = {{Rhodes}}, publisher = {{IEEE}}, title = {{{Frame-Wise and Overlap-Robust Speaker Embeddings for Meeting Diarization}}}, doi = {{10.1109/icassp49357.2023.10095370}}, year = {{2023}}, } @inproceedings{48270, author = {{Schmalenstroeer, Joerg and Gburrek, Tobias and Haeb-Umbach, Reinhold}}, booktitle = {{ITG Conference on Speech Communication}}, location = {{Aachen}}, title = {{{LibriWASN: A Data Set for Meeting Separation, Diarization, and Recognition with Asynchronous Recording Devices}}}, year = {{2023}}, } @inproceedings{47129, author = {{Cord-Landwehr, Tobias and Boeddeker, Christoph and Zorilă, Cătălin and Doddipatla, Rama and Haeb-Umbach, Reinhold}}, booktitle = {{INTERSPEECH 2023}}, publisher = {{ISCA}}, title = {{{A Teacher-Student Approach for Extracting Informative Speaker Embeddings From Speech Mixtures}}}, doi = {{10.21437/interspeech.2023-1379}}, year = {{2023}}, } @inproceedings{48355, abstract = {{Unsupervised speech disentanglement aims at separating fast varying from slowly varying components of a speech signal. In this contribution, we take a closer look at the embedding vector representing the slowly varying signal components, commonly named the speaker embedding vector. We ask which properties of a speaker's voice are captured and investigate to what extent individual embedding vector components are responsible for them, using the concept of Shapley values.
Our findings show that certain speaker-specific acoustic-phonetic properties can be fairly well predicted from the speaker embedding, while the more abstract voice quality features that were investigated cannot.}}, author = {{Rautenberg, Frederik and Kuhlmann, Michael and Wiechmann, Jana and Seebauer, Fritz and Wagner, Petra and Haeb-Umbach, Reinhold}}, booktitle = {{ITG Conference on Speech Communication}}, location = {{Aachen}}, title = {{{On Feature Importance and Interpretability of Speaker Representations}}}, year = {{2023}}, } @inproceedings{48410, author = {{Wiechmann, Jana and Rautenberg, Frederik and Wagner, Petra and Haeb-Umbach, Reinhold}}, booktitle = {{20th International Congress of the Phonetic Sciences (ICPhS)}}, title = {{{Explaining voice characteristics to novice voice practitioners - How successful is it?}}}, year = {{2023}}, } @inproceedings{48390, author = {{Berger, Simon and Vieting, Peter and Boeddeker, Christoph and Schlüter, Ralf and Haeb-Umbach, Reinhold}}, booktitle = {{INTERSPEECH 2023}}, publisher = {{ISCA}}, title = {{{Mixture Encoder for Joint Speech Separation and Recognition}}}, doi = {{10.21437/interspeech.2023-1815}}, year = {{2023}}, } @inproceedings{46069, author = {{Seebauer, Fritz and Kuhlmann, Michael and Haeb-Umbach, Reinhold and Wagner, Petra}}, booktitle = {{12th Speech Synthesis Workshop (SSW) 2023}}, title = {{{Re-examining the quality dimensions of synthetic speech}}}, year = {{2023}}, } @article{35602, abstract = {{Continuous Speech Separation (CSS) has been proposed to address speech overlaps during the analysis of realistic meeting-like conversations by eliminating any overlaps before further processing. CSS separates a recording of arbitrarily many speakers into a small number of overlap-free output channels, where each output channel may contain speech of multiple speakers. This is often done by applying a conventional separation model trained with Utterance-level Permutation Invariant Training (uPIT), which exclusively maps a speaker to an output channel, in a sliding window approach called stitching. Recently, we introduced an alternative training scheme called Graph-PIT that teaches the separation network to directly produce output streams in the required format without stitching. It can handle an arbitrary number of speakers as long as no more of them overlap at the same time than the separator has output channels. In this contribution, we further investigate the Graph-PIT training scheme. We show in extended experiments that models trained with Graph-PIT also work in challenging reverberant conditions. Models trained in this way are able to perform segment-less CSS, i.e., without stitching, and achieve separation quality that is comparable to, and often better than, conventional CSS with uPIT and stitching. We simplify the training schedule for Graph-PIT with the recently proposed Source Aggregated Signal-to-Distortion Ratio (SA-SDR) loss. It eliminates unfavorable properties of the previously used A-SDR loss and thus enables training with Graph-PIT from scratch. Graph-PIT training relaxes the constraints w.r.t. the allowed numbers of speakers and speaking patterns, which allows the use of a larger variety of training data.
Furthermore, we introduce novel signal-level evaluation metrics for meeting scenarios, namely the source-aggregated scale- and convolution-invariant Signal-to-Distortion Ratio (SA-SI-SDR and SA-CI-SDR), which are generalizations of the commonly used SDR-based metrics for the CSS case.}}, author = {{von Neumann, Thilo and Kinoshita, Keisuke and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}}, issn = {{2329-9290}}, journal = {{IEEE/ACM Transactions on Audio, Speech, and Language Processing}}, keywords = {{Continuous Speech Separation, Source Separation, Graph-PIT, Dynamic Programming, Permutation Invariant Training}}, pages = {{576--589}}, publisher = {{Institute of Electrical and Electronics Engineers (IEEE)}}, title = {{{Segment-Less Continuous Speech Separation of Meetings: Training and Evaluation Criteria}}}, doi = {{10.1109/taslp.2022.3228629}}, volume = {{31}}, year = {{2023}}, } @inproceedings{48281, abstract = {{We propose a general framework to compute the word error rate (WER) of ASR systems that process recordings containing multiple speakers at their input and that produce multiple output word sequences (MIMO). Such ASR systems are typically required, e.g., for meeting transcription. We provide an efficient implementation based on a dynamic programming search in a multi-dimensional Levenshtein distance tensor under the constraint that a reference utterance must be matched consistently with one hypothesis output. This also results in an efficient implementation of the ORC WER, which previously suffered from exponential complexity. We give an overview of commonly used WER definitions for multi-speaker scenarios and show that they are specializations of the above MIMO WER tuned to particular application scenarios. We conclude with a discussion of the pros and cons of the various WER definitions and a recommendation on when to use which.}}, author = {{von Neumann, Thilo and Boeddeker, Christoph and Kinoshita, Keisuke and Delcroix, Marc and Haeb-Umbach, Reinhold}}, booktitle = {{ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}}, keywords = {{Word Error Rate, Meeting Recognition, Levenshtein Distance}}, publisher = {{IEEE}}, title = {{{On Word Error Rate Definitions and Their Efficient Computation for Multi-Speaker Speech Recognition Systems}}}, doi = {{10.1109/icassp49357.2023.10094784}}, year = {{2023}}, } @inproceedings{48275, abstract = {{MeetEval is an open-source toolkit to evaluate all kinds of meeting transcription systems. It provides a unified interface for the computation of commonly used Word Error Rates (WERs), specifically cpWER, ORC WER, and MIMO WER, along with other WER definitions. We extend the cpWER computation with a temporal constraint to ensure that words are only identified as correct when the temporal alignment is plausible. This improves the quality of the matching of the hypothesis string to the reference string, so that it more closely resembles the actual transcription quality, and a system is penalized if it provides poor time annotations. Since word-level timing information is often not available, we present a way to approximate exact word-level timings from segment-level timings (e.g., of a sentence) and show that the approximation leads to a WER similar to that of a matching with exact word-level annotations.
At the same time, the time constraint leads to a speedup of the matching algorithm, which outweighs the additional overhead caused by processing the time stamps.}}, author = {{von Neumann, Thilo and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}}, booktitle = {{Proc. CHiME 2023 Workshop on Speech Processing in Everyday Environments}}, keywords = {{Speech Recognition, Word Error Rate, Meeting Transcription}}, location = {{Dublin}}, title = {{{MeetEval: A Toolkit for Computation of Word Error Rates for Meeting Transcription Systems}}}, year = {{2023}}, } @inproceedings{49109, abstract = {{We propose a diarization system that estimates “who spoke when” based on spatial information, to be used as a front-end of a meeting transcription system running on the signals gathered from an acoustic sensor network (ASN). Although the spatial distribution of the microphones is advantageous, exploiting the spatial diversity for diarization and signal enhancement is challenging because the microphones’ positions are typically unknown and the recorded signals are, in general, initially unsynchronized. Here, we approach these issues by first blindly synchronizing the signals and then estimating time differences of arrival (TDOAs). The TDOA information is exploited to estimate the speakers’ activity, even in the presence of multiple simultaneously active speakers. This speaker activity information serves as a guide for a spatial mixture model, on the basis of which the individual speakers’ signals are extracted via beamforming. Finally, the extracted signals are forwarded to a speech recognizer. Additionally, a novel initialization scheme for spatial mixture models based on the TDOA estimates is proposed. Experiments conducted on real recordings from the LibriWASN data set have shown that our proposed system is advantageous compared to a system using a spatial mixture model without external diarization information.}}, author = {{Gburrek, Tobias and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold}}, booktitle = {{Proc. Asilomar Conference on Signals, Systems, and Computers}}, keywords = {{Diarization, time difference of arrival, ad-hoc acoustic sensor network, meeting transcription}}, title = {{{Spatial Diarization for Meeting Transcription with Ad-Hoc Acoustic Sensor Networks}}}, year = {{2023}}, } @inproceedings{44849, author = {{Rautenberg, Frederik and Kuhlmann, Michael and Ebbers, Janek and Wiechmann, Jana and Seebauer, Fritz and Wagner, Petra and Haeb-Umbach, Reinhold}}, booktitle = {{Fortschritte der Akustik - DAGA 2023}}, location = {{Hamburg}}, pages = {{1409--1412}}, title = {{{Speech Disentanglement for Analysis and Modification of Acoustic and Perceptual Speaker Characteristics}}}, year = {{2023}}, } @inproceedings{33954, author = {{Boeddeker, Christoph and Cord-Landwehr, Tobias and von Neumann, Thilo and Haeb-Umbach, Reinhold}}, booktitle = {{Interspeech 2022}}, publisher = {{ISCA}}, title = {{{An Initialization Scheme for Meeting Separation with Spatial Mixture Models}}}, doi = {{10.21437/interspeech.2022-10929}}, year = {{2022}}, } @inproceedings{33471, abstract = {{The intelligibility of demodulated audio signals from analog high frequency transmissions, e.g., using single-sideband (SSB) modulation, can be severely degraded by channel distortions and/or a mismatch between the modulation and demodulation carrier frequency.
In this work, a neural network (NN)-based approach for carrier frequency offset (CFO) estimation from demodulated SSB signals is proposed, together with a task-specific architecture. Additionally, a simulation framework for SSB signals is introduced and utilized for training the NNs. The CFO estimator is combined with a speech enhancement network to investigate its influence on the enhancement performance. The NN-based system is compared to a recently proposed pitch-tracking-based approach on publicly available data from real high frequency transmissions. Experiments show that the NN exhibits good CFO estimation properties and results in significant improvements in speech intelligibility, especially when combined with a noise reduction network.}}, author = {{Heitkämper, Jens and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold}}, booktitle = {{Proceedings of the 30th European Signal Processing Conference (EUSIPCO)}}, location = {{Belgrade}}, title = {{{Neural Network Based Carrier Frequency Offset Estimation From Speech Transmitted Over High Frequency Channels}}}, year = {{2022}}, } @inproceedings{33958, abstract = {{Recent speaker diarization studies have shown that the integration of end-to-end neural diarization (EEND) and clustering-based diarization is a promising approach for achieving state-of-the-art performance on various tasks. Such an approach first divides an observed signal into fixed-length segments, then performs {\it segment-level} local diarization based on an EEND module, and merges the segment-level results via clustering to form a final global diarization result. The segmentation is done to limit the number of speakers in each segment, since the current EEND cannot handle a large number of speakers. In this paper, we argue that such an approach involving segmentation has several issues; for example, it inevitably faces the dilemma that larger segment sizes increase both the context available for enhancing the performance and the number of speakers that the local EEND module has to handle. To resolve this problem, this paper proposes a novel framework that performs diarization without segmentation, while still handling challenging data containing many speakers and a significant amount of overlapping speech. The proposed method can take an entire meeting for inference and perform {\it utterance-by-utterance} diarization that clusters utterance activities in terms of speakers. To this end, we leverage a neural network training scheme called Graph-PIT, recently proposed for neural source separation. Experiments with simulated active-meeting-like data and CALLHOME data show the superiority of the proposed approach over the conventional methods.}}, author = {{Kinoshita, Keisuke and von Neumann, Thilo and Delcroix, Marc and Boeddeker, Christoph and Haeb-Umbach, Reinhold}}, booktitle = {{Proc.
Interspeech 2022}}, pages = {{1486--1490}}, publisher = {{ISCA}}, title = {{{Utterance-by-utterance overlap-aware neural diarization with Graph-PIT}}}, doi = {{10.21437/Interspeech.2022-11408}}, year = {{2022}}, } @inproceedings{33819, author = {{von Neumann, Thilo and Kinoshita, Keisuke and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}}, booktitle = {{ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}}, publisher = {{IEEE}}, title = {{{SA-SDR: A Novel Loss Function for Separation of Meeting Style Data}}}, doi = {{10.1109/icassp43922.2022.9746757}}, year = {{2022}}, } @inproceedings{33847, abstract = {{The scope of speech enhancement has changed from a monolithic view of single, independent tasks to the joint processing of complex conversational speech recordings. Training and evaluation of these single tasks require synthetic data with access to intermediate signals that is as close as possible to the evaluation scenario. As such data is often not available, many works instead use specialized databases for the training of each system component, e.g., WSJ0-mix for source separation. We present a Multi-purpose Multi-Speaker Mixture Signal Generator (MMS-MSG) for generating a variety of speech mixture signals based on any speech corpus, ranging from classical anechoic mixtures (e.g., WSJ0-mix) through reverberant mixtures (e.g., SMS-WSJ) to meeting-style data. Its highly modular and flexible structure allows for the simulation of diverse environments and dynamic mixing, while simultaneously enabling an easy extension and modification to generate new scenarios and mixture types. These meetings can be used for prototyping, evaluation, or training purposes. We provide example evaluation data and baseline results for meetings based on the WSJ corpus. Further, we demonstrate its usefulness for realistic scenarios by using MMS-MSG to provide training data for the LibriCSS database.}}, author = {{Cord-Landwehr, Tobias and von Neumann, Thilo and Boeddeker, Christoph and Haeb-Umbach, Reinhold}}, booktitle = {{2022 International Workshop on Acoustic Signal Enhancement (IWAENC)}}, location = {{Bamberg}}, title = {{{MMS-MSG: A Multi-purpose Multi-Speaker Mixture Signal Generator}}}, year = {{2022}}, } @inproceedings{33848, abstract = {{Impressive progress in neural network-based single-channel speech source separation has been made in recent years. However, those improvements have mostly been reported on anechoic data, a situation that is hardly met in practice. Taking as a starting point the SepFormer, which achieves state-of-the-art performance on anechoic mixtures, we gradually modify it to optimize its performance on reverberant mixtures. Although this leads to a word error rate improvement of 7 percentage points compared to the standard SepFormer implementation, the system ends up with only marginally better performance than a PIT-BLSTM separation system that is optimized with rather straightforward means.
This is surprising and at the same time sobering, challenging the practical usefulness of many improvements reported in recent years for monaural source separation on nonreverberant data.}}, author = {{Cord-Landwehr, Tobias and Boeddeker, Christoph and von Neumann, Thilo and Zorila, Catalin and Doddipatla, Rama and Haeb-Umbach, Reinhold}}, booktitle = {{2022 International Workshop on Acoustic Signal Enhancement (IWAENC)}}, publisher = {{IEEE}}, title = {{{Monaural source separation: From anechoic to reverberant environments}}}, year = {{2022}}, }