@inproceedings{11813,
  abstract     = {The parametric Bayesian Feature Enhancement (BFE) and a data-driven Denoising Autoencoder (DA) both bring performance gains in severe single-channel speech recognition conditions. The first can be adjusted to different conditions by an appropriate parameter setting, while the latter needs to be trained on conditions similar to the ones expected at decoding time, making it vulnerable to a mismatch between training and test conditions. We use a DNN backend and study reverberant ASR under three types of mismatch conditions: different room reverberation times, different speaker to microphone distances and the difference between artificially reverberated data and the recordings in a reverberant environment. We show that for these mismatch conditions BFE can provide the targets for a DA. This unsupervised adaptation provides a performance gain over the direct use of BFE and even enables to compensate for the mismatch of real and simulated reverberant data.},
  author       = {Heymann, Jahn and Haeb-Umbach, Reinhold and Golik, Pavel and Schl{\"u}ter, Ralf},
  booktitle    = {2015 {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  keywords     = {codecs, signal denoising, speech recognition, Bayesian feature enhancement, denoising autoencoder, reverberant ASR, single-channel speech recognition, speaker to microphone distances, unsupervised adaptation, Adaptation models, Noise reduction, Reverberation, Speech, Speech recognition, Training, deep neuronal networks, feature enhancement, robust speech recognition},
  pages        = {5053--5057},
  title        = {Unsupervised Adaptation of a Denoising Autoencoder by {Bayesian} Feature Enhancement for Reverberant {ASR} under Mismatch Conditions},
  doi          = {10.1109/ICASSP.2015.7178933},
  year         = {2015},
}

@inproceedings{11841,
  abstract     = {Recently, substantial progress has been made in the field of reverberant speech signal processing, including both single- and multichannel de-reverberation techniques, and automatic speech recognition (ASR) techniques robust to reverberation. To evaluate state-of-the-art algorithms and obtain new insights regarding potential future research directions, we propose a common evaluation framework including datasets, tasks, and evaluation metrics for both speech enhancement and ASR techniques. The proposed framework will be used as a common basis for the REVERB (REverberant Voice Enhancement and Recognition Benchmark) challenge. This paper describes the rationale behind the challenge, and provides a detailed description of the evaluation framework and benchmark results.},
  author       = {Kinoshita, Keisuke and Delcroix, Marc and Yoshioka, Takuya and Nakatani, Tomohiro and Habets, Emanuel and Haeb-Umbach, Reinhold and Leutnant, Volker and Sehr, Armin and Kellermann, Walter and Maas, Roland and Gannot, Sharon and Raj, Bhiksha},
  booktitle    = {{IEEE} Workshop on Applications of Signal Processing to Audio and Acoustics ({WASPAA})},
  keywords     = {Reverberant speech, dereverberation, ASR, evaluation, challenge},
  pages        = {22--23},
  title        = {The {REVERB} Challenge: A Common Evaluation Framework for Dereverberation and Recognition of Reverberant Speech},
  year         = {2013},
}

@article{11846,
  abstract     = {In this paper, we present a new technique for automatic speech recognition (ASR) in reverberant environments. Our approach is aimed at the enhancement of the logarithmic Mel power spectrum, which is computed at an intermediate stage to obtain the widely used Mel frequency cepstral coefficients (MFCCs). Given the reverberant logarithmic Mel power spectral coefficients (LMPSCs), a minimum mean square error estimate of the clean LMPSCs is computed by carrying out Bayesian inference. We employ switching linear dynamical models as an a priori model for the dynamics of the clean LMPSCs. Further, we derive a stochastic observation model which relates the clean to the reverberant LMPSCs through a simplified model of the room impulse response (RIR). This model requires only two parameters, namely RIR energy and reverberation time, which can be estimated from the captured microphone signal. The performance of the proposed enhancement technique is studied on the AURORA5 database and compared to that of constrained maximum-likelihood linear regression (CMLLR). It is shown by experimental results that our approach significantly outperforms CMLLR and that up to 80\% of the errors caused by the reverberation are recovered. In addition to the fact that the approach is compatible with the standard MFCC feature vectors, it leaves the ASR back-end unchanged. It is of moderate computational complexity and suitable for real time applications.},
  author       = {Krueger, Alexander and Haeb-Umbach, Reinhold},
  journal      = {{IEEE} Transactions on Audio, Speech, and Language Processing},
  keywords     = {ASR, AURORA5 database, automatic speech recognition, Bayesian inference, belief networks, CMLLR, computational complexity, constrained maximum likelihood linear regression, least mean squares methods, LMPSC computation, logarithmic Mel power spectrum, maximum likelihood estimation, Mel frequency cepstral coefficients, MFCC feature vectors, microphone signal, minimum mean square error estimation, model-based feature enhancement, regression analysis, reverberant speech recognition, reverberation, RIR energy, room impulse response, speech recognition, stochastic observation model, stochastic processes},
  number       = {7},
  pages        = {1692--1707},
  title        = {Model-Based Feature Enhancement for Reverberant Speech Recognition},
  doi          = {10.1109/TASL.2010.2049684},
  volume       = {18},
  year         = {2010},
}

@article{11820,
  abstract     = {In this paper, we derive an uncertainty decoding rule for automatic speech recognition (ASR), which accounts for both corrupted observations and inter-frame correlation. The conditional independence assumption, prevalent in hidden Markov model-based ASR, is relaxed to obtain a clean speech posterior that is conditioned on the complete observed feature vector sequence. This is a more informative posterior than one conditioned only on the current observation. The novel decoding is used to obtain a transmission-error robust remote ASR system, where the speech capturing unit is connected to the decoder via an error-prone communication network. We show how the clean speech posterior can be computed for communication links being characterized by either bit errors or packet loss. Recognition results are presented for both distributed and network speech recognition, where in the latter case common voice-over-IP codecs are employed.},
  author       = {Ion, Valentin and Haeb-Umbach, Reinhold},
  journal      = {{IEEE} Transactions on Audio, Speech, and Language Processing},
  keywords     = {automatic speech recognition, bit errors, codecs, communication links, corrupted observations, decoding, distributed speech recognition, error-prone communication network, feature vector sequence, hidden Markov model-based ASR, hidden Markov models, inter-frame correlation, Internet telephony, network speech recognition, packet loss, speech posterior, speech recognition, transmission error robust speech recognition, uncertainty decoding, voice-over-IP codecs},
  number       = {5},
  pages        = {1047--1060},
  title        = {A Novel Uncertainty Decoding Rule With Applications to Transmission Error Robust Speech Recognition},
  doi          = {10.1109/TASL.2008.925879},
  volume       = {16},
  year         = {2008},
}

