@inproceedings{61079,
  abstract     = {We propose a spatio-spectral, combined model-based and data-driven
diarization pipeline consisting of TDOA-based segmentation followed by
embedding-based clustering. The proposed system requires neither access to
multi-channel training data nor prior knowledge about the number or placement
of microphones. It works for both a compact microphone array and distributed
microphones, with minor adjustments. Due to its superior handling of
overlapping speech during segmentation, the proposed pipeline significantly
outperforms the single-channel pyannote approach, both in a scenario with a
compact microphone array and in a setup with distributed microphones.
Additionally, we show that, unlike fully spatial diarization pipelines, the
proposed system can correctly track speakers when they change positions.},
  author       = {Cord-Landwehr, Tobias and Gburrek, Tobias and Deegen, Marc and Haeb-Umbach, Reinhold},
  booktitle    = {Proceedings of INTERSPEECH},
  location     = {Rotterdam},
  title        = {Spatio-Spectral Diarization of Meetings by Combining {TDOA}-Based Segmentation and Speaker Embedding-Based Clustering},
  doi          = {10.21437/Interspeech.2025-1663},
  year         = {2025},
}

@inproceedings{62174,
  author       = {Meise, Adrian Tobias and Cord-Landwehr, Tobias and Haeb-Umbach, Reinhold},
  booktitle    = {ITG Conference on Speech Communication},
  isbn         = {978-3-8007-6617-8},
  location     = {Berlin},
  title        = {On the Application of Diffusion Models for Simultaneous Denoising and Dereverberation},
  year         = {2025},
}

@inproceedings{56004,
  author       = {von Neumann, Thilo and Boeddeker, Christoph and Cord-Landwehr, Tobias and Delcroix, Marc and Haeb-Umbach, Reinhold},
  booktitle    = {2024 IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW)},
  publisher    = {IEEE},
  title        = {Meeting Recognition with Continuous Speech Separation and Transcription-Supported Diarization},
  doi          = {10.1109/icasspw62465.2024.10625894},
  year         = {2024},
}

@inproceedings{56272,
  author       = {Boeddeker, Christoph and Cord-Landwehr, Tobias and Haeb-Umbach, Reinhold},
  booktitle    = {Interspeech 2024},
  publisher    = {ISCA},
  title        = {Once More Diarization: Improving Meeting Transcription Systems through Segment-Level Speaker Reassignment},
  doi          = {10.21437/interspeech.2024-1286},
  year         = {2024},
}

@inproceedings{57085,
  abstract     = {We propose an approach for simultaneous diarization and separation of meeting data. It consists of a complex Angular Central Gaussian Mixture Model (cACGMM) for speech source separation, and a von-Mises-Fisher Mixture Model (VMFMM) for diarization in a joint statistical framework. Through the integration, both spatial and spectral information are exploited for diarization and separation. We also develop a method for counting the number of active speakers in a segment of a meeting to support block-wise processing. While the total number of speakers in a meeting may be known, it is usually not known on a per-segment level. With the proposed speaker counting, joint diarization and source separation can be done segment-by-segment, and the permutation problem across segments is solved, thus allowing for block-online processing in the future. Experimental results on the LibriCSS meeting corpus show that the integrated approach outperforms a cascaded approach of diarization and speech enhancement in terms of WER, both on a per-segment and on a per-meeting level.},
  author       = {Cord-Landwehr, Tobias and Boeddeker, Christoph and Haeb-Umbach, Reinhold},
  booktitle    = {ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  keywords     = {diarization, source separation, mixture model, meeting},
  location     = {Hyderabad, India},
  title        = {Simultaneous Diarization and Separation of Meetings through the Integration of Statistical Mixture Models},
  doi          = {10.1109/ICASSP49660.2025.10888445},
  year         = {2025},
}

@inproceedings{53659,
  author       = {Cord-Landwehr, Tobias and Boeddeker, Christoph and Zorilă, Cătălin and Doddipatla, Rama and Haeb-Umbach, Reinhold},
  booktitle    = {ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  location     = {Seoul},
  publisher    = {IEEE},
  title        = {Geodesic Interpolation of Frame-Wise Speaker Embeddings for the Diarization of Meeting Scenarios},
  doi          = {10.1109/icassp48485.2024.10445911},
  year         = {2024},
}

@inproceedings{47128,
  author       = {Cord-Landwehr, Tobias and Boeddeker, Christoph and Zorilă, Cătălin and Doddipatla, Rama and Haeb-Umbach, Reinhold},
  booktitle    = {ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  location     = {Rhodes},
  publisher    = {IEEE},
  title        = {Frame-Wise and Overlap-Robust Speaker Embeddings for Meeting Diarization},
  doi          = {10.1109/icassp49357.2023.10095370},
  year         = {2023},
}

@inproceedings{47129,
  author       = {Cord-Landwehr, Tobias and Boeddeker, Christoph and Zorilă, Cătălin and Doddipatla, Rama and Haeb-Umbach, Reinhold},
  booktitle    = {INTERSPEECH 2023},
  publisher    = {ISCA},
  title        = {A Teacher-Student Approach for Extracting Informative Speaker Embeddings From Speech Mixtures},
  doi          = {10.21437/interspeech.2023-1379},
  year         = {2023},
}

@inproceedings{54439,
  author       = {Boeddeker, Christoph and Cord-Landwehr, Tobias and von Neumann, Thilo and Haeb-Umbach, Reinhold},
  booktitle    = {7th International Workshop on Speech Processing in Everyday Environments (CHiME 2023)},
  publisher    = {ISCA},
  title        = {Multi-Stage Diarization Refinement for the {CHiME}-7 {DASR} Scenario},
  doi          = {10.21437/chime.2023-10},
  year         = {2023},
}

@inproceedings{33847,
  abstract     = {The scope of speech enhancement has changed from a monolithic view of single,
independent tasks, to a joint processing of complex conversational speech
recordings. Training and evaluation of these single tasks requires synthetic
data with access to intermediate signals that is as close as possible to the
evaluation scenario. As such data often is not available, many works instead
use specialized databases for the training of each system component, e.g.,
WSJ0-mix for source separation. We present a Multi-purpose Multi-Speaker
Mixture Signal Generator (MMS-MSG) for generating a variety of speech mixture
signals based on any speech corpus, ranging from classical anechoic mixtures
(e.g., WSJ0-mix) over reverberant mixtures (e.g., SMS-WSJ) to meeting-style
data. Its highly modular and flexible structure allows for the simulation of
diverse environments and dynamic mixing, while simultaneously enabling an easy
extension and modification to generate new scenarios and mixture types. These
meetings can be used for prototyping, evaluation, or training purposes. We
provide example evaluation data and baseline results for meetings based on the
WSJ corpus. Further, we demonstrate the usefulness for realistic scenarios by
using MMS-MSG to provide training data for the LibriCSS database.},
  author       = {Cord-Landwehr, Tobias and von Neumann, Thilo and Boeddeker, Christoph and Haeb-Umbach, Reinhold},
  booktitle    = {2022 International Workshop on Acoustic Signal Enhancement (IWAENC)},
  location     = {Bamberg},
  title        = {{MMS-MSG}: A Multi-Purpose Multi-Speaker Mixture Signal Generator},
  year         = {2022},
}

@inproceedings{33848,
  abstract     = {Impressive progress in neural network-based single-channel speech source
separation has been made in recent years. But those improvements have been
mostly reported on anechoic data, a situation that is hardly met in practice.
Taking the SepFormer as a starting point, which achieves state-of-the-art
performance on anechoic mixtures, we gradually modify it to optimize its
performance on reverberant mixtures. Although this leads to a word error rate
improvement by 7 percentage points compared to the standard SepFormer
implementation, the system ends up with only marginally better performance than
a PIT-BLSTM separation system, that is optimized with rather straightforward
means. This is surprising and at the same time sobering, challenging the
practical usefulness of many improvements reported in recent years for monaural
source separation on nonreverberant data.},
  author       = {Cord-Landwehr, Tobias and Boeddeker, Christoph and von Neumann, Thilo and Zorilă, Cătălin and Doddipatla, Rama and Haeb-Umbach, Reinhold},
  booktitle    = {2022 International Workshop on Acoustic Signal Enhancement (IWAENC)},
  publisher    = {IEEE},
  title        = {Monaural Source Separation: From Anechoic to Reverberant Environments},
  year         = {2022},
}

@misc{33816,
  author       = {Gburrek, Tobias and Boeddeker, Christoph and von Neumann, Thilo and Cord-Landwehr, Tobias and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold},
  title        = {A Meeting Transcription System for an Ad-Hoc Acoustic Sensor Network},
  eprint       = {2205.00944},
  eprinttype   = {arXiv},
  doi          = {10.48550/ARXIV.2205.00944},
  year         = {2022},
}

@inproceedings{33954,
  author       = {Boeddeker, Christoph and Cord-Landwehr, Tobias and von Neumann, Thilo and Haeb-Umbach, Reinhold},
  booktitle    = {Interspeech 2022},
  publisher    = {ISCA},
  title        = {An Initialization Scheme for Meeting Separation with Spatial Mixture Models},
  doi          = {10.21437/interspeech.2022-10929},
  year         = {2022},
}

@inproceedings{29304,
  abstract     = {In this work we address disentanglement of style and content in speech signals. We propose a fully convolutional variational autoencoder employing two encoders: a content encoder and a style encoder. To foster disentanglement, we propose adversarial contrastive predictive coding. This new disentanglement method does neither need parallel data nor any supervision. We show that the proposed technique is capable of separating speaker and content traits into the two different representations and show competitive speaker-content disentanglement performance compared to other unsupervised approaches. We further demonstrate an increased robustness of the content representation against a train-test mismatch compared to spectral features, when used for phone recognition.},
  author       = {Ebbers, Janek and Kuhlmann, Michael and Cord-Landwehr, Tobias and Haeb-Umbach, Reinhold},
  booktitle    = {Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {3860--3864},
  title        = {Contrastive Predictive Coding Supported Factorized Variational Autoencoder for Unsupervised Learning of Disentangled Speech Representations},
  year         = {2021},
}

@inproceedings{20700,
  author       = {Boeddeker, Christoph and Cord-Landwehr, Tobias and Heitkaemper, Jens and Zorilă, Cătălin and Hayakawa, Daichi and Li, Mohan and Liu, Min and Doddipatla, Rama and Haeb-Umbach, Reinhold},
  booktitle    = {Proc. CHiME 2020 Workshop on Speech Processing in Everyday Environments},
  title        = {Towards a Speaker Diarization System for the {CHiME} 2020 Dinner Party Transcription},
  year         = {2020},
}

