@article{35602,
  abstract     = {{Continuous Speech Separation (CSS) has been proposed to address speech overlaps during the analysis of realistic meeting-like conversations by eliminating any overlaps before further processing.
CSS separates a recording of arbitrarily many speakers into a small number of overlap-free output channels, where each output channel may contain speech of multiple speakers.
This is often done by applying a conventional separation model trained with Utterance-level Permutation Invariant Training (uPIT), which exclusively maps a speaker to an output channel, in a sliding-window approach called stitching.
Recently, we introduced an alternative training scheme called Graph-PIT that teaches the separation network to directly produce output streams in the required format without stitching.
It can handle an arbitrary number of speakers as long as no more of them overlap at any point in time than the separator has output channels.
In this contribution, we further investigate the Graph-PIT training scheme.
We show in extended experiments that models trained with Graph-PIT also work in challenging reverberant conditions.
Models trained in this way are able to perform segment-less CSS, i.e., without stitching, and achieve separation quality comparable to, and often better than, that of conventional CSS with uPIT and stitching.
We simplify the training schedule for Graph-PIT with the recently proposed Source Aggregated Signal-to-Distortion Ratio (SA-SDR) loss.
It eliminates unfavorable properties of the previously used A-SDR loss and thus enables training with Graph-PIT from scratch.
Graph-PIT training relaxes the constraints w.r.t. the allowed numbers of speakers and speaking patterns, which allows the use of a larger variety of training data.
Furthermore, we introduce novel signal-level evaluation metrics for meeting scenarios, namely the source-aggregated scale- and convolution-invariant Signal-to-Distortion Ratio (SA-SI-SDR and SA-CI-SDR), which are generalizations of the commonly used SDR-based metrics for the CSS case.}},
  author       = {{von Neumann, Thilo and Kinoshita, Keisuke and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}},
  issn         = {{2329-9290}},
  journal      = {{IEEE/ACM Transactions on Audio, Speech, and Language Processing}},
  keywords     = {{Continuous Speech Separation, Source Separation, Graph-PIT, Dynamic Programming, Permutation Invariant Training}},
  pages        = {{576--589}},
  publisher    = {{Institute of Electrical and Electronics Engineers (IEEE)}},
  title        = {{{Segment-Less Continuous Speech Separation of Meetings: Training and Evaluation Criteria}}},
  doi          = {{10.1109/TASLP.2022.3228629}},
  volume       = {{31}},
  year         = {{2023}},
}

@inproceedings{48275,
  abstract     = {{MeetEval is an open-source toolkit to evaluate all kinds of meeting transcription systems.
It provides a unified interface for the computation of commonly used Word Error Rates (WERs), specifically cpWER, ORC WER, and MIMO WER, along with other WER definitions.
We extend the cpWER computation by a temporal constraint to ensure that words are only counted as correct when the temporal alignment is plausible.
This leads to a matching of the hypothesis string to the reference string that more closely reflects the actual transcription quality, and a system is penalized if it provides poor time annotations.
Since word-level timing information is often not available, we present a way to approximate exact word-level timings from segment-level timings (e.g., of a sentence) and show that the approximation leads to a WER similar to that of a matching with exact word-level annotations.
At the same time, the time constraint leads to a speedup of the matching algorithm, which outweighs the additional overhead caused by processing the time stamps.}},
  author       = {{von Neumann, Thilo and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}},
  booktitle    = {{Proc. CHiME 2023 Workshop on Speech Processing in Everyday Environments}},
  keywords     = {{Speech Recognition, Word Error Rate, Meeting Transcription}},
  location     = {{Dublin}},
  title        = {{{MeetEval: A Toolkit for Computation of Word Error Rates for Meeting Transcription Systems}}},
  year         = {{2023}},
}

@inproceedings{26770,
  abstract     = {{Automatic transcription of meetings requires handling of overlapped speech, which calls for continuous speech separation (CSS) systems. The uPIT criterion was proposed for utterance-level separation with neural networks and introduces the constraint that the total number of speakers must not exceed the number of output channels. When processing meeting-like data in a segment-wise manner, i.e., by separating overlapping segments independently and stitching adjacent segments into continuous output streams, this constraint has to be fulfilled for any segment. In this contribution, we show that this constraint can be significantly relaxed. We propose a novel graph-based PIT criterion, which casts the assignment of utterances to output channels as a graph coloring problem. It only requires that the number of concurrently active speakers must not exceed the number of output channels. As a consequence, the system can process an arbitrary number of speakers and arbitrarily long segments and thus can handle more diverse scenarios.
Further, the stitching algorithm for obtaining a consistent output order in neighboring segments is of less importance and can even be eliminated completely, not least reducing the computational effort. Experiments on meeting-style WSJ data show improvements in recognition performance over using the uPIT criterion.}},
  author       = {{von Neumann, Thilo and Kinoshita, Keisuke and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}},
  booktitle    = {{Interspeech 2021}},
  keywords     = {{Continuous speech separation, automatic speech recognition, overlapped speech, permutation invariant training}},
  title        = {{{Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers}}},
  doi          = {{10.21437/interspeech.2021-1177}},
  year         = {{2021}},
}

@article{57971,
  abstract     = {{Repetitive TMS (rTMS) with a frequency of 5-10~Hz is widely used for language mapping. However, it may be accompanied by discomfort and is limited in the number and reliability of evoked language errors. Here, we systematically tested the influence of different stimulation frequencies (i.e., 10, 30, and 50 Hz) on tolerability, number, reliability, and cortical distribution of language errors, aiming at improved language mapping. Fifteen right-handed, healthy subjects (m~=~8, median age: 29 yrs) were investigated in two sessions, separated by 2-5 days. In each session, 10, 30, and 50 Hz rTMS were applied over the left hemisphere in a randomized order during a picture naming task. Overall, 30 Hz rTMS evoked significantly more errors (20 $\pm$ 12{%}) compared to 50 Hz (12 $\pm$ 8{%}; p {\textless}.01), whereas error rates were comparable between 30/50 and 10~Hz (18 $\pm$ 11{%}). Across all conditions, a significantly higher error rate was found in Session 1 (19 $\pm$ 13{%}) compared to Session 2 (13 $\pm$ 7{%}, p {\textless}.05). The error rate was poorly reliable between sessions for 10 (intraclass correlation coefficient, ICC~=~.315) and 30 Hz (ICC~=~.427), whereas 50 Hz showed a moderate reliability (ICC~=~.597). Spatial reliability of language errors was low to moderate with a tendency toward increased reliability for higher frequencies, for example, within frontal regions. Compared to 10~Hz, both 30 and 50 Hz were rated as less painful. Taken together, our data favor the use of rTMS protocols employing higher frequencies for evoking language errors reliably and with reduced discomfort, depending on the region of interest.}},
  author       = {{Nettekoven, Charlotte and Pieczewski, Julia and Neuschmelting, Volker and Jonas, Kristina and Goldbrunner, Roland and Grefkes, Christian and Weiss Lucas, Carolin}},
  journal      = {{Human Brain Mapping}},
  keywords     = {{Adult, Brain Mapping, Cerebral Cortex/diagnostic imaging/physiology, Female, Humans, Magnetic Resonance Imaging, Male, Pattern Recognition, Psycholinguistics, Reproducibility of Results, Speech/physiology, Transcranial Magnetic Stimulation, Visual/physiology, Young Adult}},
  number       = {{16}},
  pages        = {{5309--5321}},
  title        = {{{Improving the efficacy and reliability of rTMS language mapping by increasing the stimulation frequency}}},
  doi          = {{10.1002/hbm.25619}},
  volume       = {{42}},
  year         = {{2021}},
}

@inproceedings{20504,
  abstract     = {{In recent years, time domain speech separation has excelled over frequency domain separation in single channel scenarios and noise-free environments. In this paper we dissect the gains of the time-domain audio separation network (TasNet) approach by gradually replacing components of an utterance-level permutation invariant training (u-PIT) based separation system in the frequency domain until the TasNet system is reached, thus blending components of frequency domain approaches with those of time domain approaches. Some of the intermediate variants achieve comparable signal-to-distortion ratio (SDR) gains to TasNet, but retain the advantage of frequency domain processing: compatibility with classic signal processing tools such as frequency-domain beamforming and the human interpretability of the masks. Furthermore, we show that the scale invariant signal-to-distortion ratio (si-SDR) criterion used as a loss function in TasNet is related to a logarithmic mean square error criterion and that it is this criterion which contributes most reliably to the performance advantage of TasNet. Finally, we critically assess which gains in a noise-free single channel environment generalize to more realistic reverberant conditions.}},
  author       = {{Heitkaemper, Jens and Jakobeit, Darius and Boeddeker, Christoph and Drude, Lukas and Haeb-Umbach, Reinhold}},
  booktitle    = {{ICASSP 2020, Barcelona, Spain (Virtual)}},
  keywords     = {{voice activity detection, speech activity detection, neural network, statistical speech processing}},
  title        = {{{Demystifying TasNet: A Dissecting Approach}}},
  year         = {{2020}},
}

@inproceedings{20505,
  abstract     = {{Speech activity detection (SAD), which often rests on the fact that the noise is ``more'' stationary than speech, is particularly challenging in non-stationary environments, because the time variance of the acoustic scene makes it difficult to discriminate speech from noise. We propose two approaches to SAD, where one is based on statistical signal processing, while the other utilizes neural networks. The former employs sophisticated signal processing to track the noise and speech energies and is meant to support the case for a resource-efficient, unsupervised signal processing approach.
The latter introduces a recurrent network layer that operates on short segments of the input speech to do temporal smoothing in the presence of non-stationary noise. The systems are tested on the Fearless Steps challenge database, which consists of the transmission data from the Apollo-11 space mission.
The statistical SAD achieves detection performance comparable to that of earlier proposed neural network based SADs, while the neural network based approach leads to a decision cost function of 1.07{%} on the evaluation set of the 2020 Fearless Steps Challenge, which sets a new state of the art.}},
  author       = {{Heitkaemper, Jens and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold}},
  booktitle    = {{INTERSPEECH 2020, Shanghai, China (Virtual)}},
  keywords     = {{voice activity detection, speech activity detection, neural network, statistical speech processing}},
  title        = {{{Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments}}},
  year         = {{2020}},
}

@inproceedings{17557,
  abstract     = {{Previous work by [1] studied gesture-speech interaction in adults. [1] focussed on temporal and semantic coordination of gesture and speech and found that while adult speech is mostly coordinated (or redundant) with gestures, semantic coordination increases the temporal synchrony. These observations do not necessarily hold for children (in particular with respect to iconic gestures, see [2]), where the speech and gesture systems are still under development. We studied the semantic and temporal coordination of speech and gesture in 4-year-old children using a corpus of 40 children producing action descriptions in task-oriented dialogues. In particular, we examined what kinds of information are transmitted verbally vs. non-verbally and how they are related. To account for this, we extended the semantic features (SFs) developed in [3] for object descriptions in order to include the semantics of actions. We coded the SFs on the children’s speech and gestures separately using video data. In our presentation, we will focus on the quantitative distribution of SFs across gesture and speech. Our results indicate that speech and gestures of 4-year-olds are less integrated than those of the adults, although there is a large variability among the children. We will discuss the results with respect to the cognitive processes (e.g., visual memory, language) underlying children’s abilities at this stage of development. Our work paves the way for the cognitive architecture of speech-gesture interaction in preschoolers, which to our knowledge is missing so far.}},
  author       = {{Abramov, Olga and Kopp, Stefan and Nemeth, Anne and Kern, Friederike and Mertens, Ulrich and Rohlfing, Katharina}},
  booktitle    = {{KOGWIS2018: Computational Approaches to Cognitive Science}},
  keywords     = {{Speech-gesture integration, semantic features}},
  title        = {{{Towards a Computational Model of Child Gesture-Speech Production}}},
  year         = {{2018}},
}

@inproceedings{11739,
  abstract     = {{Noise tracking is an important component of speech enhancement algorithms. Of the many noise trackers proposed, Minimum Statistics (MS) is a particularly popular one due to its simple parameterization and at the same time excellent performance. In this paper we propose to further reduce the number of MS parameters by giving an alternative derivation of an optimal smoothing constant. At the same time the noise tracking performance is improved as is demonstrated by experiments employing speech degraded by various noise types and at different SNR values.}},
  author       = {{Chinaev, Aleksej and Haeb-Umbach, Reinhold}},
  booktitle    = {{Interspeech 2015}},
  keywords     = {{speech enhancement, noise tracking, optimal smoothing}},
  pages        = {{1785--1789}},
  title        = {{{On Optimal Smoothing in Minimum Statistics Based Noise Tracking}}},
  year         = {{2015}},
}

@inproceedings{11813,
  abstract     = {{The parametric Bayesian Feature Enhancement (BFE) and a data-driven Denoising Autoencoder (DA) both bring performance gains in severe single-channel speech recognition conditions. The former can be adjusted to different conditions by an appropriate parameter setting, while the latter needs to be trained on conditions similar to the ones expected at decoding time, making it vulnerable to a mismatch between training and test conditions. We use a DNN backend and study reverberant ASR under three types of mismatch conditions: different room reverberation times, different speaker-to-microphone distances, and the difference between artificially reverberated data and recordings in a reverberant environment. We show that for these mismatch conditions BFE can provide the targets for a DA. This unsupervised adaptation provides a performance gain over the direct use of BFE and even enables compensation for the mismatch of real and simulated reverberant data.}},
  author       = {{Heymann, Jahn and Haeb-Umbach, Reinhold and Golik, P. and Schlueter, R.}},
  booktitle    = {{2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}},
  keywords     = {{codecs, signal denoising, speech recognition, Bayesian feature enhancement, denoising autoencoder, reverberant ASR, single-channel speech recognition, speaker to microphone distances, unsupervised adaptation, Adaptation models, Noise reduction, Reverberation, Speech, Speech recognition, Training, deep neuronal networks, denoising autoencoder, feature enhancement, robust speech recognition}},
  pages        = {{5053--5057}},
  title        = {{{Unsupervised adaptation of a denoising autoencoder by Bayesian Feature Enhancement for reverberant ASR under mismatch conditions}}},
  doi          = {{10.1109/ICASSP.2015.7178933}},
  year         = {{2015}},
}

@inproceedings{57964,
  author       = {{Pieczewski, Julia and Neuschmelting, Volker and Thiele, Kristina and Grefkes, Christian and Goldbrunner, Roland and Weiss Lucas, Carolin}},
  keywords     = {{610 Medical sciences, Medicine, reliability, speech mapping, TMS}},
  publisher    = {{German Medical Science GMS Publishing House}},
  title        = {{{Good retest reliability of the rate of speech errors evoked by 10 Hz navigated repetitive transcranial magnetic stimulation in healthy volunteers}}},
  doi          = {{10.3205/15dgnc394}},
  year         = {{2015}},
}

@inproceedings{11753,
  abstract     = {{This contribution describes a step-wise source counting algorithm to determine the number of speakers in an offline scenario. Each speaker is identified by a variational expectation maximization (VEM) algorithm for complex Watson mixture models and therefore directly yields beamforming vectors for a subsequent speech separation process. An observation selection criterion is proposed which improves the robustness of the source counting in noise. The algorithm is compared to an alternative VEM approach with Gaussian mixture models based on directions of arrival and shown to deliver improved source counting accuracy. The article concludes by extending the offline algorithm towards a low-latency online estimation of the number of active sources from the streaming input data.}},
  author       = {{Drude, Lukas and Chinaev, Aleksej and Tran Vu, Dang Hai and Haeb-Umbach, Reinhold}},
  booktitle    = {{14th International Workshop on Acoustic Signal Enhancement (IWAENC 2014)}},
  keywords     = {{Accuracy, Acoustics, Estimation, Mathematical model, Source separation, Speech, Vectors, Bayes methods, Blind source separation, Directional statistics, Number of speakers, Speaker diarization}},
  pages        = {{213--217}},
  title        = {{{Towards Online Source Counting in Speech Mixtures Applying a Variational EM for Complex Watson Mixture Models}}},
  year         = {{2014}},
}

@article{11861,
  abstract     = {{In this contribution we present a theoretical and experimental investigation into the effects of reverberation and noise on features in the logarithmic mel power spectral domain, an intermediate stage in the computation of the mel frequency cepstral coefficients, prevalent in automatic speech recognition (ASR). Gaining insight into the complex interaction between clean speech, noise, and noisy reverberant speech features is essential for any ASR system to be robust against noise and reverberation present in distant microphone input signals. The findings are gathered in a probabilistic formulation of an observation model which may be used in model-based feature compensation schemes. The proposed observation model extends previous models in three major directions: First, the contribution of additive background noise to the observation error is explicitly taken into account. Second, an energy compensation constant is introduced which ensures an unbiased estimate of the reverberant speech features, and, third, a recursive variant of the observation model is developed resulting in reduced computational complexity when used in model-based feature compensation. The experimental section is used to evaluate the accuracy of the model and to describe how its parameters can be determined from test data.}},
  author       = {{Leutnant, Volker and Krueger, Alexander and Haeb-Umbach, Reinhold}},
  issn         = {{2329-9290}},
  journal      = {{IEEE/ACM Transactions on Audio, Speech, and Language Processing}},
  keywords     = {{computational complexity, reverberation, speech recognition, automatic speech recognition, background noise, clean speech, computational complexity, energy compensation, logarithmic mel power spectral domain, mel frequency cepstral coefficients, microphone input signals, model-based feature compensation schemes, noisy reverberant speech automatic recognition, noisy reverberant speech features, reverberation, Atmospheric modeling, Computational modeling, Noise, Noise measurement, Reverberation, Speech, Vectors, Model-based feature compensation, observation model for reverberant and noisy speech, recursive observation model, robust automatic speech recognition}},
  number       = {{1}},
  pages        = {{95--109}},
  title        = {{{A New Observation Model in the Logarithmic Mel Power Spectral Domain for the Automatic Recognition of Noisy Reverberant Speech}}},
  doi          = {{10.1109/TASLP.2013.2285480}},
  volume       = {{22}},
  year         = {{2014}},
}

@article{11867,
  abstract     = {{New waves of consumer-centric applications, such as voice search and voice interaction with mobile devices and home entertainment systems, increasingly require automatic speech recognition (ASR) to be robust to the full range of real-world noise and other acoustic distorting conditions. Despite its practical importance, however, the inherent links between and distinctions among the myriad of methods for noise-robust ASR have yet to be carefully studied in order to advance the field further. To this end, it is critical to establish a solid, consistent, and common mathematical foundation for noise-robust ASR, which is lacking at present. This article is intended to fill this gap and to provide a thorough overview of modern noise-robust techniques for ASR developed over the past 30 years. We emphasize methods that are proven to be successful and that are likely to sustain or expand their future applicability. We distill key insights from our comprehensive overview in this field and take a fresh look at a few old problems, which nevertheless are still highly relevant today. Specifically, we have analyzed and categorized a wide range of noise-robust techniques using five different criteria: 1) feature-domain vs. model-domain processing, 2) the use of prior knowledge about the acoustic environment distortion, 3) the use of explicit environment-distortion models, 4) deterministic vs. uncertainty processing, and 5) the use of acoustic models trained jointly with the same feature enhancement or model adaptation process used in the testing stage. With this taxonomy-oriented review, we equip the reader with the insight to choose among techniques and with the awareness of the performance-complexity tradeoffs. The pros and cons of using different noise-robust ASR techniques in practical application scenarios are provided as a guide to interested practitioners. The current challenges and future research directions in this field are also carefully analyzed.}},
  author       = {{Li, Jinyu and Deng, Li and Gong, Yifan and Haeb-Umbach, Reinhold}},
  journal      = {{IEEE Transactions on Audio, Speech, and Language Processing}},
  keywords     = {{Speech recognition, compensation, distortion modeling, joint model training, noise, robustness, uncertainty processing}},
  number       = {{4}},
  pages        = {{745--777}},
  title        = {{{An Overview of Noise-Robust Automatic Speech Recognition}}},
  doi          = {{10.1109/TASLP.2014.2304637}},
  volume       = {{22}},
  year         = {{2014}},
}

@inproceedings{11716,
  abstract     = {{The accuracy of automatic speech recognition systems in noisy and reverberant environments can be improved notably by exploiting the uncertainty of the estimated speech features using so-called uncertainty-of-observation techniques. In this paper, we introduce a new Bayesian decision rule that can serve as a mathematical framework from which both known and new uncertainty-of-observation techniques can be either derived or approximated. The new decision rule in its direct form leads to the new significance decoding approach for Gaussian mixture models, which results in better performance compared to standard uncertainty-of-observation techniques in different additive and convolutive noise scenarios.}},
  author       = {{Abdelaziz, Ahmed H. and Zeiler, Steffen and Kolossa, Dorothea and Leutnant, Volker and Haeb-Umbach, Reinhold}},
  booktitle    = {{2013 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}},
  issn         = {{1520-6149}},
  keywords     = {{Bayes methods, Gaussian processes, convolution, decision theory, decoding, noise, reverberation, speech coding, speech recognition, Bayesian decision rule, GMM, Gaussian mixture models, additive noise scenarios, automatic speech recognition systems, convolutive noise scenarios, decoding approach, mathematical framework, reverberant environments, significance decoding, speech feature estimation, uncertainty-of-observation techniques, Hidden Markov models, Maximum likelihood decoding, Noise, Speech, Speech recognition, Uncertainty, Uncertainty-of-observation, modified imputation, noise robust speech recognition, significance decoding, uncertainty decoding}},
  pages        = {{6827--6831}},
  title        = {{{GMM-based significance decoding}}},
  doi          = {{10.1109/ICASSP.2013.6638984}},
  year         = {{2013}},
}

@inproceedings{11841,
  abstract     = {{Recently, substantial progress has been made in the field of reverberant speech signal processing, including both single- and multichannel de-reverberation techniques, and automatic speech recognition (ASR) techniques robust to reverberation. To evaluate state-of-the-art algorithms and obtain new insights regarding potential future research directions, we propose a common evaluation framework including datasets, tasks, and evaluation metrics for both speech enhancement and ASR techniques. The proposed framework will be used as a common basis for the REVERB (REverberant Voice Enhancement and Recognition Benchmark) challenge. This paper describes the rationale behind the challenge, and provides a detailed description of the evaluation framework and benchmark results.}},
  author       = {{Kinoshita, Keisuke and Delcroix, Marc and Yoshioka, Takuya and Nakatani, Tomohiro and Habets, Emanuel and Haeb-Umbach, Reinhold and Leutnant, Volker and Sehr, Armin and Kellermann, Walter and Maas, Roland and Gannot, Sharon and Raj, Bhiksha}},
  booktitle    = {{IEEE Workshop on Applications of Signal Processing to Audio and Acoustics}},
  keywords     = {{Reverberant speech, dereverberation, ASR, evaluation, challenge}},
  pages        = {{22--23}},
  title        = {{{The REVERB challenge: A common evaluation framework for dereverberation and recognition of reverberant speech}}},
  year         = {{2013}},
}

@article{11862,
  abstract     = {{In this contribution we extend a previously proposed Bayesian approach for the enhancement of reverberant logarithmic mel power spectral coefficients for robust automatic speech recognition to the additional compensation of background noise. A recently proposed observation model is employed whose time-variant observation error statistics are obtained as a side product of the inference of the a posteriori probability density function of the clean speech feature vectors. Further, a reduction of the computational effort and the memory requirements is achieved by using a recursive formulation of the observation model. The performance of the proposed algorithms is first experimentally studied on a connected digits recognition task with artificially created noisy reverberant data. It is shown that the use of the time-variant observation error model leads to a significant error rate reduction at low signal-to-noise ratios compared to a time-invariant model. Further experiments were conducted on a 5000-word task recorded in a reverberant and noisy environment. A significant word error rate reduction was obtained, demonstrating the effectiveness of the approach on real-world data.}},
  author       = {{Leutnant, Volker and Krueger, Alexander and Haeb-Umbach, Reinhold}},
  journal      = {{IEEE Transactions on Audio, Speech, and Language Processing}},
  keywords     = {{Bayes methods, compensation, error statistics, reverberation, speech recognition, Bayesian feature enhancement, background noise, clean speech feature vectors, compensation, connected digits recognition task, error statistics, memory requirements, noisy reverberant data, posteriori probability density function, recursive formulation, reverberant logarithmic mel power spectral coefficients, robust automatic speech recognition, signal-to-noise ratios, time-variant observation, word error rate reduction, Robust automatic speech recognition, model-based Bayesian feature enhancement, observation model for reverberant and noisy speech, recursive observation model}},
  number       = {{8}},
  pages        = {{1640--1652}},
  title        = {{{Bayesian Feature Enhancement for Reverberation and Noise Robust Speech Recognition}}},
  doi          = {{10.1109/TASL.2013.2258013}},
  volume       = {{21}},
  year         = {{2013}},
}

@inproceedings{11917,
  abstract     = {{In this paper we present a speech presence probability (SPP) estimation algorithm which exploits both temporal and spectral correlations of speech. To this end, the SPP estimation is formulated as the posterior probability estimation of the states of a two-dimensional (2D) Hidden Markov Model (HMM). We derive an iterative algorithm based on the turbo principle to decode the 2D-HMM. The experimental results show that indeed the SPP estimates improve from iteration to iteration, and further clearly outperform another state-of-the-art SPP estimation algorithm.}},
  author       = {{Tran Vu, Dang Hai and Haeb-Umbach, Reinhold}},
  booktitle    = {{38th International Conference on Acoustics, Speech and Signal Processing (ICASSP 2013)}},
  issn         = {{1520-6149}},
  keywords     = {{correlation methods, estimation theory, hidden Markov models, iterative methods, probability, spectral analysis, speech processing, 2D HMM, SPP estimates, iterative algorithm, posterior probability estimation, spectral correlation, speech presence probability estimation, state-of-the-art SPP estimation algorithm, temporal correlation, turbo principle, two-dimensional hidden Markov model, Correlation, Decoding, Estimation, Iterative decoding, Noise, Speech, Vectors}},
  pages        = {{863--867}},
  title        = {{{Using the turbo principle for exploiting temporal and spectral correlations in speech presence probability estimation}}},
  doi          = {{10.1109/ICASSP.2013.6637771}},
  year         = {{2013}},
}

@inproceedings{11745,
  abstract     = {{In this paper we present a novel noise power spectral density tracking algorithm and its use in single-channel speech enhancement. It has the unique feature that it is able to track the noise statistics even if speech is dominant in a given time-frequency bin. As a consequence it can follow non-stationary noise superposed by speech, even in the critical case of rising noise power. The algorithm requires an initial estimate of the power spectrum of speech and is thus meant to be used as a postprocessor to a first speech enhancement stage. An experimental comparison with a state-of-the-art noise tracking algorithm demonstrates lower estimation errors under low SNR conditions and smaller fluctuations of the estimated values, resulting in improved speech quality as measured by PESQ scores.}},
  author       = {{Chinaev, Aleksej and Krueger, Alexander and Tran Vu, Dang Hai and Haeb-Umbach, Reinhold}},
  booktitle    = {{37th International Conference on Acoustics, Speech and Signal Processing (ICASSP 2012)}},
  keywords     = {{MAP parameter estimation, noise power estimation, speech enhancement}},
  title        = {{{Improved Noise Power Spectral Density Tracking by a MAP-based Postprocessor}}},
  year         = {{2012}},
}

@inproceedings{11864,
  abstract     = {{In this work, an observation model for the joint compensation of noise and reverberation in the logarithmic mel power spectral density domain is considered. It relates the features of the noisy reverberant speech to those of the non-reverberant speech and the noise. In contrast to enhancement of features only corrupted by reverberation (reverberant features), enhancement of noisy reverberant features requires a more sophisticated model for the error introduced by the proposed observation model. First, it will be shown that this error is highly dependent on the instantaneous ratio of the power of reverberant speech to the power of the noise and, moreover, sensitive to the phase between reverberant speech and noise in the short-time discrete Fourier domain. Afterwards, a statistically motivated approach will be presented that allows the observation error model to be inferred from the error model previously used for the reverberation-only case. Finally, the developed observation error model will be utilized in a Bayesian feature enhancement scheme, leading to improvements in word accuracy on the AURORA5 database.}},
  author       = {{Leutnant, Volker and Krueger, Alexander and Haeb-Umbach, Reinhold}},
  booktitle    = {{2012 IEEE International Conference on Signal Processing, Communications and Computing (ICSPCC)}},
  keywords     = {{Robust Automatic Speech Recognition, Bayesian feature enhancement, observation model for reverberant and noisy speech}},
  title        = {{{A Statistical Observation Model For Noisy Reverberant Speech Features and its Application to Robust ASR}}},
  year         = {{2012}},
}

