@unpublished{56273,
  abstract     = {{This paper presents the CHiME-8 DASR challenge, which carries on from the
previous edition, CHiME-7 DASR (C7DASR), and the past CHiME-6 challenge. It
focuses on joint multi-channel distant speech recognition (DASR) and
diarization with one or more, possibly heterogeneous, devices. The main goal is
to spur research towards meeting transcription approaches that can generalize
across an arbitrary number of speakers, diverse settings (formal vs. informal
conversations), meeting durations, a wide variety of acoustic scenarios, and
different recording configurations. Novelties with respect to C7DASR include:
i) the addition of NOTSOFAR-1, an additional office/corporate meeting scenario,
ii) a manually corrected Mixer 6 development set, iii) a new track in which we
allow the use of large language models (LLMs), and iv) a jury award mechanism
to encourage participants to also explore more practical and innovative
solutions. To lower the entry barrier for participants, we provide a standalone
toolkit for downloading and preparing these datasets, as well as for performing
text normalization and scoring submissions. Furthermore, this year we also
provide two baseline systems: one directly inherited from C7DASR and based on
ESPnet, and another one developed on NeMo and based on the NeMo team's
submission to last year's C7DASR. Baseline system results suggest that the
addition of the NOTSOFAR-1 scenario significantly increases the task's
difficulty due to its high number of speakers and very short duration.}},
  author       = {{Cornell, Samuele and Park, Taejin and Huang, Steve and Boeddeker, Christoph and Chang, Xuankai and Maciejewski, Matthew and Wiesner, Matthew and Garcia, Paola and Watanabe, Shinji}},
  note         = {{arXiv:2407.16447}},
  title        = {{{The CHiME-8 DASR Challenge for Generalizable and Array Agnostic Distant Automatic Speech Recognition and Diarization}}},
  year         = {{2024}},
}

@article{52958,
  author       = {{Boeddeker, Christoph and Subramanian, Aswin Shanmugam and Wichern, Gordon and Haeb-Umbach, Reinhold and Le Roux, Jonathan}},
  issn         = {{2329-9290}},
  journal      = {{IEEE/ACM Transactions on Audio, Speech, and Language Processing}},
  keywords     = {{Electrical and Electronic Engineering, Acoustics and Ultrasonics, Computer Science (miscellaneous), Computational Mathematics}},
  pages        = {{1185--1197}},
  publisher    = {{Institute of Electrical and Electronics Engineers (IEEE)}},
  title        = {{{TS-SEP: Joint Diarization and Separation Conditioned on Estimated Speaker Embeddings}}},
  doi          = {{10.1109/taslp.2024.3350887}},
  volume       = {{32}},
  year         = {{2024}},
}

@inproceedings{56004,
  author       = {{von Neumann, Thilo and Boeddeker, Christoph and Cord-Landwehr, Tobias and Delcroix, Marc and Haeb-Umbach, Reinhold}},
  booktitle    = {{2024 IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW)}},
  publisher    = {{IEEE}},
  title        = {{{Meeting Recognition with Continuous Speech Separation and Transcription-Supported Diarization}}},
  doi          = {{10.1109/icasspw62465.2024.10625894}},
  year         = {{2024}},
}

@inproceedings{56272,
  author       = {{Boeddeker, Christoph and Cord-Landwehr, Tobias and Haeb-Umbach, Reinhold}},
  booktitle    = {{Interspeech 2024}},
  publisher    = {{ISCA}},
  title        = {{{Once more Diarization: Improving meeting transcription systems through segment-level speaker reassignment}}},
  doi          = {{10.21437/interspeech.2024-1286}},
  year         = {{2024}},
}

@inproceedings{57659,
  author       = {{Vieting, Peter and Berger, Simon and von Neumann, Thilo and Boeddeker, Christoph and Schlüter, Ralf and Haeb-Umbach, Reinhold}},
  booktitle    = {{2024 IEEE Spoken Language Technology Workshop (SLT)}},
  title        = {{{Combining TF-GridNet and Mixture Encoder for Continuous Speech Separation for Meeting Transcription}}},
  year         = {{2024}},
}

@inproceedings{57085,
  abstract     = {{We propose an approach for simultaneous diarization and separation of meeting data. It consists of a complex Angular Central Gaussian Mixture Model (cACGMM) for speech source separation and a von Mises-Fisher Mixture Model (VMFMM) for diarization in a joint statistical framework. Through the integration, both spatial and spectral information is exploited for diarization and separation. We also develop a method for counting the number of active speakers in a segment of a meeting to support block-wise processing. While the total number of speakers in a meeting may be known, it is usually not known on a per-segment level. With the proposed speaker counting, joint diarization and source separation can be done segment-by-segment, and the permutation problem across segments is solved, thus allowing for block-online processing in the future. Experimental results on the LibriCSS meeting corpus show that the integrated approach outperforms a cascaded approach of diarization and speech enhancement in terms of WER, both on a per-segment and on a per-meeting level.}},
  author       = {{Cord-Landwehr, Tobias and Boeddeker, Christoph and Haeb-Umbach, Reinhold}},
  booktitle    = {{ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}},
  keywords     = {{diarization, source separation, mixture model, meeting}},
  location     = {{Hyderabad, India}},
  title        = {{{Simultaneous Diarization and Separation of Meetings through the Integration of Statistical Mixture Models}}},
  doi          = {{10.1109/ICASSP49660.2025.10888445}},
  year         = {{2025}},
}

@inproceedings{53659,
  author       = {{Cord-Landwehr, Tobias and Boeddeker, Christoph and Zorilă, Cătălin and Doddipatla, Rama and Haeb-Umbach, Reinhold}},
  booktitle    = {{ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}},
  location     = {{Seoul}},
  publisher    = {{IEEE}},
  title        = {{{Geodesic Interpolation of Frame-Wise Speaker Embeddings for the Diarization of Meeting Scenarios}}},
  doi          = {{10.1109/icassp48485.2024.10445911}},
  year         = {{2024}},
}

@inproceedings{48391,
  author       = {{Aralikatti, Rohith and Boeddeker, Christoph and Wichern, Gordon and Subramanian, Aswin Shanmugam and Le Roux, Jonathan}},
  booktitle    = {{ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}},
  publisher    = {{IEEE}},
  title        = {{{Reverberation as Supervision For Speech Separation}}},
  doi          = {{10.1109/icassp49357.2023.10095022}},
  year         = {{2023}},
}

@article{35602,
  abstract     = {{Continuous Speech Separation (CSS) has been proposed to address speech overlaps during the analysis of realistic meeting-like conversations by eliminating any overlaps before further processing.
CSS separates a recording of arbitrarily many speakers into a small number of overlap-free output channels, where each output channel may contain speech of multiple speakers.
This is often done by applying a conventional separation model trained with Utterance-level Permutation Invariant Training (uPIT), which exclusively maps a speaker to an output channel, in a sliding-window approach called stitching.
Recently, we introduced an alternative training scheme called Graph-PIT that teaches the separation network to directly produce output streams in the required format without stitching.
It can handle an arbitrary number of speakers as long as no more of them overlap at any point in time than the separator has output channels.
In this contribution, we further investigate the Graph-PIT training scheme.
We show in extended experiments that models trained with Graph-PIT also work in challenging reverberant conditions.
Models trained in this way are able to perform segment-less CSS, i.e., without stitching, and achieve comparable and often better separation quality than conventional CSS with uPIT and stitching.
We simplify the training schedule for Graph-PIT with the recently proposed Source-Aggregated Signal-to-Distortion Ratio (SA-SDR) loss.
It eliminates unfavorable properties of the previously used A-SDR loss and thus enables training with Graph-PIT from scratch.
Graph-PIT training relaxes the constraints w.r.t. the allowed numbers of speakers and speaking patterns, which allows the use of a larger variety of training data.
Furthermore, we introduce novel signal-level evaluation metrics for meeting scenarios, namely the source-aggregated scale- and convolution-invariant Signal-to-Distortion Ratio (SA-SI-SDR and SA-CI-SDR), which are generalizations of the commonly used SDR-based metrics for the CSS case.}},
  author       = {{von Neumann, Thilo and Kinoshita, Keisuke and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}},
  issn         = {{2329-9290}},
  journal      = {{IEEE/ACM Transactions on Audio, Speech, and Language Processing}},
  keywords     = {{Continuous Speech Separation, Source Separation, Graph-PIT, Dynamic Programming, Permutation Invariant Training}},
  pages        = {{576--589}},
  publisher    = {{Institute of Electrical and Electronics Engineers (IEEE)}},
  title        = {{{Segment-Less Continuous Speech Separation of Meetings: Training and Evaluation Criteria}}},
  doi          = {{10.1109/taslp.2022.3228629}},
  volume       = {{31}},
  year         = {{2023}},
}

@inproceedings{48281,
  abstract     = {{We propose a general framework to compute the word error rate (WER) of ASR systems that process recordings containing multiple speakers at their input and that produce multiple output word sequences (MIMO).
Such ASR systems are typically required, e.g., for meeting transcription.
We provide an efficient implementation based on a dynamic programming search in a multi-dimensional Levenshtein distance tensor under the constraint that a reference utterance must be matched consistently with one hypothesis output.
This also results in an efficient implementation of the ORC WER, which previously suffered from exponential complexity.
We give an overview of commonly used WER definitions for multi-speaker scenarios and show that they are specializations of the above MIMO WER tuned to particular application scenarios.
We conclude with a discussion of the pros and cons of the various WER definitions and a recommendation on when to use which.}},
  author       = {{von Neumann, Thilo and Boeddeker, Christoph and Kinoshita, Keisuke and Delcroix, Marc and Haeb-Umbach, Reinhold}},
  booktitle    = {{ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}},
  keywords     = {{Word Error Rate, Meeting Recognition, Levenshtein Distance}},
  publisher    = {{IEEE}},
  title        = {{{On Word Error Rate Definitions and Their Efficient Computation for Multi-Speaker Speech Recognition Systems}}},
  doi          = {{10.1109/icassp49357.2023.10094784}},
  year         = {{2023}},
}

@inproceedings{48275,
  abstract     = {{MeetEval is an open-source toolkit to evaluate all kinds of meeting transcription systems.
It provides a unified interface for the computation of commonly used Word Error Rates (WERs), specifically cpWER, ORC WER, and MIMO WER, along with other WER definitions.
We extend the cpWER computation by a temporal constraint to ensure that words are only identified as correct when the temporal alignment is plausible.
This leads to a matching of the hypothesis string to the reference string that more closely resembles the actual transcription quality, and a system is penalized if it provides poor time annotations.
Since word-level timing information is often not available, we present a way to approximate exact word-level timings from segment-level timings (e.g., of a sentence) and show that the approximation leads to a similar WER as a matching with exact word-level annotations.
At the same time, the temporal constraint leads to a speedup of the matching algorithm, which outweighs the additional overhead caused by processing the time stamps.}},
  author       = {{von Neumann, Thilo and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}},
  booktitle    = {{Proc. CHiME 2023 Workshop on Speech Processing in Everyday Environments}},
  keywords     = {{Speech Recognition, Word Error Rate, Meeting Transcription}},
  location     = {{Dublin}},
  title        = {{{MeetEval: A Toolkit for Computation of Word Error Rates for Meeting Transcription Systems}}},
  year         = {{2023}},
}

@inproceedings{47128,
  author       = {{Cord-Landwehr, Tobias and Boeddeker, Christoph and Zorilă, Cătălin and Doddipatla, Rama and Haeb-Umbach, Reinhold}},
  booktitle    = {{ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}},
  location     = {{Rhodes}},
  publisher    = {{IEEE}},
  title        = {{{Frame-Wise and Overlap-Robust Speaker Embeddings for Meeting Diarization}}},
  doi          = {{10.1109/icassp49357.2023.10095370}},
  year         = {{2023}},
}

@inproceedings{47129,
  author       = {{Cord-Landwehr, Tobias and Boeddeker, Christoph and Zorilă, Cătălin and Doddipatla, Rama and Haeb-Umbach, Reinhold}},
  booktitle    = {{INTERSPEECH 2023}},
  publisher    = {{ISCA}},
  title        = {{{A Teacher-Student Approach for Extracting Informative Speaker Embeddings From Speech Mixtures}}},
  doi          = {{10.21437/interspeech.2023-1379}},
  year         = {{2023}},
}

@inproceedings{54439,
  author       = {{Boeddeker, Christoph and Cord-Landwehr, Tobias and von Neumann, Thilo and Haeb-Umbach, Reinhold}},
  booktitle    = {{7th International Workshop on Speech Processing in Everyday Environments (CHiME 2023)}},
  publisher    = {{ISCA}},
  title        = {{{Multi-stage diarization refinement for the CHiME-7 DASR scenario}}},
  doi          = {{10.21437/chime.2023-10}},
  year         = {{2023}},
}

@inproceedings{48390,
  author       = {{Berger, Simon and Vieting, Peter and Boeddeker, Christoph and Schlüter, Ralf and Haeb-Umbach, Reinhold}},
  booktitle    = {{INTERSPEECH 2023}},
  publisher    = {{ISCA}},
  title        = {{{Mixture Encoder for Joint Speech Separation and Recognition}}},
  doi          = {{10.21437/interspeech.2023-1815}},
  year         = {{2023}},
}

@article{33669,
  abstract     = {{Far-field multi-speaker automatic speech recognition (ASR) has drawn increasing attention in recent years. Most existing methods feature a signal processing frontend and an ASR backend. In realistic scenarios, these modules are usually trained separately or progressively, which suffers from either inter-module mismatch or a complicated training process. In this paper, we propose an end-to-end multi-channel model that jointly optimizes the speech enhancement (including speech dereverberation, denoising, and separation) frontend and the ASR backend as a single system. To the best of our knowledge, this is the first work that proposes to optimize dereverberation, beamforming, and multi-speaker ASR in a fully end-to-end manner. The frontend module consists of a weighted prediction error (WPE) based submodule for dereverberation and a neural beamformer for denoising and speech separation. For the backend, we adopt a widely used end-to-end (E2E) ASR architecture. It is worth noting that the entire model is differentiable and can be optimized in a fully end-to-end manner using only the ASR criterion, without the need of parallel signal-level labels. We evaluate the proposed model on several multi-speaker benchmark datasets, and experimental results show that the fully E2E ASR model can achieve competitive performance on both noisy and reverberant conditions, with over 30% relative word error rate (WER) reduction over the single-channel baseline systems.}},
  author       = {{Zhang, Wangyou and Chang, Xuankai and Boeddeker, Christoph and Nakatani, Tomohiro and Watanabe, Shinji and Qian, Yanmin}},
  issn         = {{2329-9290}},
  journal      = {{IEEE/ACM Transactions on Audio, Speech, and Language Processing}},
  title        = {{{End-to-End Dereverberation, Beamforming, and Speech Recognition in A Cocktail Party}}},
  doi          = {{10.1109/TASLP.2022.3209942}},
  year         = {{2022}},
}

@inproceedings{33847,
  abstract     = {{The scope of speech enhancement has changed from a monolithic view of single,
independent tasks to a joint processing of complex conversational speech
recordings. Training and evaluation of these single tasks require synthetic
data with access to intermediate signals that is as close as possible to the
evaluation scenario. As such data often is not available, many works instead
use specialized databases for the training of each system component, e.g.,
WSJ0-mix for source separation. We present a Multi-purpose Multi-Speaker
Mixture Signal Generator (MMS-MSG) for generating a variety of speech mixture
signals based on any speech corpus, ranging from classical anechoic mixtures
(e.g., WSJ0-mix) through reverberant mixtures (e.g., SMS-WSJ) to meeting-style
data. Its highly modular and flexible structure allows for the simulation of
diverse environments and dynamic mixing, while simultaneously enabling easy
extension and modification to generate new scenarios and mixture types. These
meetings can be used for prototyping, evaluation, or training purposes. We
provide example evaluation data and baseline results for meetings based on the
WSJ corpus. Further, we demonstrate the usefulness for realistic scenarios by
using MMS-MSG to provide training data for the LibriCSS database.}},
  author       = {{Cord-Landwehr, Tobias and von Neumann, Thilo and Boeddeker, Christoph and Haeb-Umbach, Reinhold}},
  booktitle    = {{2022 International Workshop on Acoustic Signal Enhancement (IWAENC)}},
  location     = {{Bamberg}},
  title        = {{{MMS-MSG: A Multi-purpose Multi-Speaker Mixture Signal Generator}}},
  year         = {{2022}},
}

@inproceedings{33848,
  abstract     = {{Impressive progress in neural network-based single-channel speech source
separation has been made in recent years. But those improvements have mostly
been reported on anechoic data, a situation that is hardly met in practice.
Taking the SepFormer as a starting point, which achieves state-of-the-art
performance on anechoic mixtures, we gradually modify it to optimize its
performance on reverberant mixtures. Although this leads to a word error rate
improvement of 7 percentage points compared to the standard SepFormer
implementation, the system ends up with only marginally better performance
than a PIT-BLSTM separation system that is optimized with rather
straightforward means. This is surprising and at the same time sobering,
challenging the practical usefulness of many improvements reported in recent
years for monaural source separation on nonreverberant data.}},
  author       = {{Cord-Landwehr, Tobias and Boeddeker, Christoph and von Neumann, Thilo and Zorilă, Cătălin and Doddipatla, Rama and Haeb-Umbach, Reinhold}},
  booktitle    = {{2022 International Workshop on Acoustic Signal Enhancement (IWAENC)}},
  publisher    = {{IEEE}},
  title        = {{{Monaural source separation: From anechoic to reverberant environments}}},
  year         = {{2022}},
}

@inproceedings{33819,
  author       = {{von Neumann, Thilo and Kinoshita, Keisuke and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}},
  booktitle    = {{ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}},
  publisher    = {{IEEE}},
  title        = {{{SA-SDR: A Novel Loss Function for Separation of Meeting Style Data}}},
  doi          = {{10.1109/icassp43922.2022.9746757}},
  year         = {{2022}},
}

@misc{33816,
  author       = {{Gburrek, Tobias and Boeddeker, Christoph and von Neumann, Thilo and Cord-Landwehr, Tobias and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold}},
  publisher    = {{arXiv}},
  title        = {{{A Meeting Transcription System for an Ad-Hoc Acoustic Sensor Network}}},
  doi          = {{10.48550/ARXIV.2205.00944}},
  year         = {{2022}},
}

