@inproceedings{59999,
  author       = {Rautenberg, Frederik and Kuhlmann, Michael and Seebauer, Fritz and Wiechmann, Jana and Wagner, Petra and Haeb-Umbach, Reinhold},
  booktitle    = {{ICASSP} 2025 - 2025 {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  location     = {Hyderabad, India},
  publisher    = {IEEE},
  title        = {Speech Synthesis along Perceptual Voice Quality Dimensions},
  doi          = {10.1109/icassp49660.2025.10888012},
  year         = {2025},
}

@inproceedings{61047,
  author       = {Rautenberg, Frederik and Seebauer, Fritz and Wiechmann, Jana and Kuhlmann, Michael and Wagner, Petra and Haeb-Umbach, Reinhold},
  booktitle    = {Interspeech 2025},
  location     = {Rotterdam},
  publisher    = {ISCA},
  title        = {Synthesizing Speech with Selected Perceptual Voice Qualities -- A Case Study with Creaky Voice},
  doi          = {10.21437/Interspeech.2025-1443},
  year         = {2025},
}

@inproceedings{61079,
  abstract     = {We propose a spatio-spectral, combined model-based and data-driven
diarization pipeline consisting of TDOA-based segmentation followed by
embedding-based clustering. The proposed system requires neither access to
multi-channel training data nor prior knowledge about the number or placement
of microphones. It works for both a compact microphone array and distributed
microphones, with minor adjustments. Due to its superior handling of
overlapping speech during segmentation, the proposed pipeline significantly
outperforms the single-channel pyannote approach, both in a scenario with a
compact microphone array and in a setup with distributed microphones.
Additionally, we show that, unlike fully spatial diarization pipelines, the
proposed system can correctly track speakers when they change positions.},
  author       = {Cord-Landwehr, Tobias and Gburrek, Tobias and Deegen, Marc and Haeb-Umbach, Reinhold},
  booktitle    = {Interspeech 2025},
  location     = {Rotterdam},
  publisher    = {ISCA},
  title        = {Spatio-spectral diarization of meetings by combining {TDOA-based} segmentation and speaker embedding-based clustering},
  doi          = {10.21437/Interspeech.2025-1663},
  year         = {2025},
}

@inproceedings{62164,
  author       = {Kuhlmann, Michael and Seebauer, Fritz and Wagner, Petra and Haeb-Umbach, Reinhold},
  booktitle    = {Interspeech 2025},
  location     = {Rotterdam},
  publisher    = {ISCA},
  title        = {Towards Frame-level Quality Predictions of Synthetic Speech},
  doi          = {10.21437/interspeech.2025-2190},
  year         = {2025},
}

@inproceedings{62163,
  abstract     = {Zero-shot classifiers based on Contrastive Language-Audio Pretraining (CLAP) models enable classification of given audio into classes defined at test time using text. These models are costly to run with respect to computation and memory requirements. In this work, we propose to build a specialized low-resource classifier for classes pre-defined using text, using a two-stage procedure consisting of zero-shot data set pruning and model compression. First, relevant in-domain data is selected from a source dataset using class label embeddings obtained from a pre-trained CLAP model. This data is then used to distill the audio encoder of a CLAP model. The proposed compression method produces compact audio encoders with slightly reduced accuracy. Note that neither labeled nor unlabeled in-domain audio data is required for its development. We verify by cross-dataset tests that the resulting classifiers are indeed specialized to their task.},
  author       = {Werning, Alexander and Haeb-Umbach, Reinhold},
  booktitle    = {Proceedings of the 16th {ITG} Conference on Speech Communication},
  editor       = {Möller, Sebastian and Gerkmann, Timo and Kolossa, Dorothea},
  location     = {Berlin},
  pages        = {76--80},
  title        = {A Fully Zero-Shot Approach to Obtaining Specialized and Compact Audio Tagging Models},
  year         = {2025},
}

@inproceedings{59900,
  abstract     = {Running state-of-the-art large-scale audio models on edge devices is often infeasible due to their limited storage and computing resources. It is therefore necessary to compress and tune the models for the specific target task and hardware. This is commonly achieved by distilling the audio model, the teacher, to a small target model, the student. However, this approach can be improved by prepending a dataset pruning stage and training the teacher on the pruned data set only, which contains examples relevant to the target task. Recently, CLAP models have emerged that embed audio and text examples in a common embedding space. We use the audio embeddings of the CLAP model for the above pruning stage, which is realized using a domain classifier. After knowledge distillation, the student is eventually fine-tuned on some data from the target domain. The CLAP architecture combines text and audio embedding spaces, which allows to search for data given only a textual description, such as a class label. We show how this can help data pruning.},
  author       = {Werning, Alexander and Haeb-Umbach, Reinhold},
  booktitle    = {Proceedings of DAS|DAGA 2025},
  location     = {Copenhagen},
  title        = {Distilling Efficient Audio Models using Data Pruning with {CLAP}},
  year         = {2025},
}

@inproceedings{62174,
  author       = {Meise, Adrian Tobias and Cord-Landwehr, Tobias and Haeb-Umbach, Reinhold},
  booktitle    = {Proceedings of the 16th {ITG} Conference on Speech Communication},
  isbn         = {978-3-8007-6617-8},
  location     = {Berlin},
  title        = {On the Application of Diffusion Models for Simultaneous Denoising and Dereverberation},
  year         = {2025},
}

@unpublished{56273,
  abstract     = {This paper presents the CHiME-8 DASR challenge which carries on from the
previous edition CHiME-7 DASR (C7DASR) and the past CHiME-6 challenge. It
focuses on joint multi-channel distant speech recognition (DASR) and
diarization with one or more, possibly heterogeneous, devices. The main goal is
to spur research towards meeting transcription approaches that can generalize
across arbitrary number of speakers, diverse settings (formal vs. informal
conversations), meeting duration, wide-variety of acoustic scenarios and
different recording configurations. Novelties with respect to C7DASR include:
i) the addition of NOTSOFAR-1, an additional office/corporate meeting scenario,
ii) a manually corrected Mixer 6 development set, iii) a new track in which we
allow the use of large-language models (LLM) iv) a jury award mechanism to
encourage participants to explore also more practical and innovative solutions.
To lower the entry barrier for participants, we provide a standalone toolkit
for downloading and preparing such datasets as well as performing text
normalization and scoring their submissions. Furthermore, this year we also
provide two baseline systems, one directly inherited from C7DASR and based on
ESPnet and another one developed on NeMo and based on NeMo team submission in
last year C7DASR. Baseline system results suggest that the addition of the
NOTSOFAR-1 scenario significantly increases the task's difficulty due to its
high number of speakers and very short duration.},
  author       = {Cornell, Samuele and Park, Taejin and Huang, Steve and Boeddeker, Christoph and Chang, Xuankai and Maciejewski, Matthew and Wiesner, Matthew and Garcia, Paola and Watanabe, Shinji},
  eprint       = {2407.16447},
  eprinttype   = {arXiv},
  note         = {arXiv:2407.16447},
  title        = {The {CHiME-8} {DASR} Challenge for Generalizable and Array Agnostic Distant Automatic Speech Recognition and Diarization},
  year         = {2024},
}

@article{52958,
  author       = {Boeddeker, Christoph and Subramanian, Aswin Shanmugam and Wichern, Gordon and Haeb-Umbach, Reinhold and Le Roux, Jonathan},
  issn         = {2329-9290},
  journal      = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
  keywords     = {Electrical and Electronic Engineering, Acoustics and Ultrasonics, Computer Science (miscellaneous), Computational Mathematics},
  pages        = {1185--1197},
  publisher    = {Institute of Electrical and Electronics Engineers (IEEE)},
  title        = {{TS-SEP}: Joint Diarization and Separation Conditioned on Estimated Speaker Embeddings},
  doi          = {10.1109/taslp.2024.3350887},
  volume       = {32},
  year         = {2024},
}

@techreport{57161,
  author       = {Werning, Alexander and Haeb-Umbach, Reinhold},
  institution  = {Paderborn University},
  title        = {{UPB-NT} submission to {DCASE24}: Dataset pruning for targeted knowledge distillation},
  year         = {2024},
}

@inproceedings{57099,
  author       = {Xie, Yuying and Kuhlmann, Michael and Rautenberg, Frederik and Tan, Zheng-Hua and Haeb-Umbach, Reinhold},
  booktitle    = {2024 32nd European Signal Processing Conference ({EUSIPCO})},
  pages        = {436--440},
  title        = {Speaker and Style Disentanglement of Speech Based on Contrastive Predictive Coding Supported Factorized Variational Autoencoder},
  year         = {2024},
}

@inproceedings{56004,
  author       = {von Neumann, Thilo and Boeddeker, Christoph and Cord-Landwehr, Tobias and Delcroix, Marc and Haeb-Umbach, Reinhold},
  booktitle    = {2024 {IEEE} International Conference on Acoustics, Speech, and Signal Processing Workshops ({ICASSPW})},
  publisher    = {IEEE},
  title        = {Meeting Recognition with Continuous Speech Separation and Transcription-Supported Diarization},
  doi          = {10.1109/icasspw62465.2024.10625894},
  year         = {2024},
}

@inproceedings{56272,
  author       = {Boeddeker, Christoph and Cord-Landwehr, Tobias and Haeb-Umbach, Reinhold},
  booktitle    = {Interspeech 2024},
  publisher    = {ISCA},
  title        = {Once more {Diarization}: Improving meeting transcription systems through segment-level speaker reassignment},
  doi          = {10.21437/interspeech.2024-1286},
  year         = {2024},
}

@inproceedings{57659,
  author       = {Vieting, Peter and Berger, Simon and von Neumann, Thilo and Boeddeker, Christoph and Schlüter, Ralf and Haeb-Umbach, Reinhold},
  booktitle    = {2024 {IEEE} Spoken Language Technology Workshop ({SLT})},
  title        = {Combining {TF-GridNet} and Mixture Encoder for Continuous Speech Separation for Meeting Transcription},
  year         = {2024},
}

@inproceedings{57085,
  abstract     = {We propose an approach for simultaneous diarization and separation of meeting data. It consists of a complex Angular Central Gaussian Mixture Model (cACGMM) for speech source separation, and a von-Mises-Fisher Mixture Model (VMFMM) for diarization in a joint statistical framework. Through the integration, both spatial and spectral information are exploited for diarization and separation. We also develop a method for counting the number of active speakers in a segment of a meeting to support block-wise processing. While the total number of speakers in a meeting may be known, it is usually not known on a per-segment level. With the proposed speaker counting, joint diarization and source separation can be done segment-by-segment, and the permutation problem across segments is solved, thus allowing for block-online processing in the future. Experimental results on the LibriCSS meeting corpus show that the integrated approach outperforms a cascaded approach of diarization and speech enhancement in terms of WER, both on a per-segment and on a per-meeting level.},
  author       = {Cord-Landwehr, Tobias and Boeddeker, Christoph and Haeb-Umbach, Reinhold},
  booktitle    = {{ICASSP} 2025 - 2025 {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  keywords     = {diarization, source separation, mixture model, meeting},
  location     = {Hyderabad, India},
  publisher    = {IEEE},
  title        = {Simultaneous Diarization and Separation of Meetings through the Integration of Statistical Mixture Models},
  doi          = {10.1109/ICASSP49660.2025.10888445},
  year         = {2025},
}

@inproceedings{53659,
  author       = {Cord-Landwehr, Tobias and Boeddeker, Christoph and Zorilă, Cătălin and Doddipatla, Rama and Haeb-Umbach, Reinhold},
  booktitle    = {{ICASSP} 2024 - 2024 {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  location     = {Seoul},
  publisher    = {IEEE},
  title        = {Geodesic Interpolation of Frame-Wise Speaker Embeddings for the Diarization of Meeting Scenarios},
  doi          = {10.1109/icassp48485.2024.10445911},
  year         = {2024},
}

@inproceedings{57160,
  abstract     = {Large audio tagging models are usually trained or pre-trained on AudioSet, a dataset that encompasses a large amount of different sound classes and acoustic environments. Knowledge distillation has emerged as a method to compress such models without compromising their effectiveness. There are many different applications for audio tagging, some of which require a specialization to a narrow domain of sounds to be classified. For these scenarios, it is beneficial to distill the large audio tagger with respect to a specific subset of sounds of interest. A method to prune a general dataset with respect to a target dataset is presented. By distilling with such a specialized pruned dataset, we obtain a compressed model with better classification accuracy in the specific target domain than with target-agnostic distillation.},
  author       = {Werning, Alexander and Haeb-Umbach, Reinhold},
  booktitle    = {2024 32nd European Signal Processing Conference ({EUSIPCO})},
  keywords     = {data pruning, knowledge distillation, audio tagging},
  location     = {Lyon},
  title        = {Target-Specific Dataset Pruning for Compression of Audio Tagging Models},
  year         = {2024},
}

@inproceedings{57031,
  author       = {Gburrek, Tobias and Meise, Adrian Tobias and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold},
  booktitle    = {2024 18th International Workshop on Acoustic Signal Enhancement ({IWAENC})},
  publisher    = {IEEE},
  title        = {Diminishing Domain Mismatch for {DNN-Based} Acoustic Distance Estimation via Stochastic Room Reverberation Models},
  doi          = {10.1109/iwaenc61483.2024.10694103},
  year         = {2024},
}

@inproceedings{48269,
  author       = {Gburrek, Tobias and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold},
  booktitle    = {European Signal Processing Conference ({EUSIPCO})},
  location     = {Helsinki},
  title        = {On the Integration of Sampling Rate Synchronization and Acoustic Beamforming},
  year         = {2023},
}

@inproceedings{48270,
  author       = {Schmalenstroeer, Joerg and Gburrek, Tobias and Haeb-Umbach, Reinhold},
  booktitle    = {{ITG} Conference on Speech Communication},
  location     = {Aachen},
  title        = {{LibriWASN}: A Data Set for Meeting Separation, Diarization, and Recognition with Asynchronous Recording Devices},
  year         = {2023},
}

