@inproceedings{11813,
  abstract = {{The parametric Bayesian Feature Enhancement (BFE) and a data-driven Denoising Autoencoder (DA) both bring performance gains in severe single-channel speech recognition conditions. The former can be adjusted to different conditions by an appropriate parameter setting, while the latter needs to be trained on conditions similar to the ones expected at decoding time, making it vulnerable to a mismatch between training and test conditions. We use a DNN backend and study reverberant ASR under three types of mismatch conditions: different room reverberation times, different speaker-to-microphone distances, and the difference between artificially reverberated data and recordings made in a reverberant environment. We show that for these mismatch conditions BFE can provide the targets for a DA. This unsupervised adaptation provides a performance gain over the direct use of BFE and even makes it possible to compensate for the mismatch between real and simulated reverberant data.}},
  author = {{Heymann, Jahn and Haeb-Umbach, Reinhold and Golik, P. and Schlueter, R.}},
  booktitle = {{2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}},
  keywords = {{codecs, signal denoising, speech recognition, Bayesian feature enhancement, denoising autoencoder, reverberant ASR, single-channel speech recognition, speaker to microphone distances, unsupervised adaptation, Adaptation models, Noise reduction, Reverberation, Speech, Speech recognition, Training, deep neural networks, feature enhancement, robust speech recognition}},
  pages = {{5053--5057}},
  title = {{{Unsupervised adaptation of a denoising autoencoder by Bayesian Feature Enhancement for reverberant ASR under mismatch conditions}}},
  doi = {{10.1109/ICASSP.2015.7178933}},
  year = {{2015}},
}

@article{11861,
  abstract = {{In this contribution, we present a theoretical and experimental investigation into the effects of reverberation and noise on features in the logarithmic mel power spectral domain, an intermediate stage in the computation of the mel frequency cepstral coefficients, which are prevalent in automatic speech recognition (ASR). Gaining insight into the complex interaction between clean speech, noise, and noisy reverberant speech features is essential if an ASR system is to be robust against the noise and reverberation present in distant microphone input signals. The findings are gathered in a probabilistic formulation of an observation model which may be used in model-based feature compensation schemes. The proposed observation model extends previous models in three major directions: First, the contribution of additive background noise to the observation error is explicitly taken into account. Second, an energy compensation constant is introduced which ensures an unbiased estimate of the reverberant speech features. Third, a recursive variant of the observation model is developed, resulting in reduced computational complexity when used in model-based feature compensation.
The experimental section evaluates the accuracy of the model and describes how its parameters can be determined from test data.}},
  author = {{Leutnant, Volker and Krueger, Alexander and Haeb-Umbach, Reinhold}},
  issn = {{2329-9290}},
  journal = {{IEEE/ACM Transactions on Audio, Speech, and Language Processing}},
  keywords = {{computational complexity, reverberation, speech recognition, automatic speech recognition, background noise, clean speech, energy compensation, logarithmic mel power spectral domain, mel frequency cepstral coefficients, microphone input signals, model-based feature compensation schemes, noisy reverberant speech automatic recognition, noisy reverberant speech features, Atmospheric modeling, Computational modeling, Noise, Noise measurement, Reverberation, Speech, Vectors, Model-based feature compensation, observation model for reverberant and noisy speech, recursive observation model}},
  number = {{1}},
  pages = {{95--109}},
  title = {{{A New Observation Model in the Logarithmic Mel Power Spectral Domain for the Automatic Recognition of Noisy Reverberant Speech}}},
  doi = {{10.1109/TASLP.2013.2285480}},
  volume = {{22}},
  year = {{2014}},
}

@inproceedings{11716,
  abstract = {{The accuracy of automatic speech recognition systems in noisy and reverberant environments can be improved notably by exploiting the uncertainty of the estimated speech features using so-called uncertainty-of-observation techniques. In this paper, we introduce a new Bayesian decision rule that can serve as a mathematical framework from which both known and new uncertainty-of-observation techniques can be either derived or approximated. The new decision rule in its direct form leads to the new significance decoding approach for Gaussian mixture models, which results in better performance compared to standard uncertainty-of-observation techniques in different additive and convolutive noise scenarios.}},
  author = {{Abdelaziz, Ahmed H. and Zeiler, Steffen and Kolossa, Dorothea and Leutnant, Volker and Haeb-Umbach, Reinhold}},
  booktitle = {{2013 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}},
  issn = {{1520-6149}},
  keywords = {{Bayes methods, Gaussian processes, convolution, decision theory, decoding, noise, reverberation, speech coding, speech recognition, Bayesian decision rule, GMM, Gaussian mixture models, additive noise scenarios, automatic speech recognition systems, convolutive noise scenarios, decoding approach, mathematical framework, reverberant environments, significance decoding, speech feature estimation, uncertainty-of-observation techniques, Hidden Markov models, Maximum likelihood decoding, Noise, Speech, Speech recognition, Uncertainty, Uncertainty-of-observation, modified imputation, noise robust speech recognition, uncertainty decoding}},
  pages = {{6827--6831}},
  title = {{{GMM-based significance decoding}}},
  doi = {{10.1109/ICASSP.2013.6638984}},
  year = {{2013}},
}

@article{11862,
  abstract = {{In this contribution we extend a previously proposed Bayesian approach for the enhancement of reverberant logarithmic mel power spectral coefficients for robust automatic speech recognition to the additional compensation of background noise.
A recently proposed observation model is employed, whose time-variant observation error statistics are obtained as a by-product of the inference of the a posteriori probability density function of the clean speech feature vectors. Further, a reduction of the computational effort and memory requirements is achieved by using a recursive formulation of the observation model. The performance of the proposed algorithms is first studied experimentally on a connected digits recognition task with artificially created noisy reverberant data. It is shown that the use of the time-variant observation error model leads to a significant error rate reduction at low signal-to-noise ratios compared to a time-invariant model. Further experiments were conducted on a 5000-word task recorded in a reverberant and noisy environment. A significant word error rate reduction was obtained, demonstrating the effectiveness of the approach on real-world data.}},
  author = {{Leutnant, Volker and Krueger, Alexander and Haeb-Umbach, Reinhold}},
  journal = {{IEEE Transactions on Audio, Speech, and Language Processing}},
  keywords = {{Bayes methods, compensation, error statistics, reverberation, speech recognition, Bayesian feature enhancement, background noise, clean speech feature vectors, connected digits recognition task, memory requirements, noisy reverberant data, posteriori probability density function, recursive formulation, reverberant logarithmic mel power spectral coefficients, robust automatic speech recognition, signal-to-noise ratios, time-variant observation, word error rate reduction, Robust automatic speech recognition, model-based Bayesian feature enhancement, observation model for reverberant and noisy speech, recursive observation model}},
  number = {{8}},
  pages = {{1640--1652}},
  title = {{{Bayesian Feature Enhancement for Reverberation and Noise Robust Speech Recognition}}},
  doi = {{10.1109/TASL.2013.2258013}},
  volume = {{21}},
  year = {{2013}},
}

@article{11850,
  abstract = {{In this paper, we present a novel blocking matrix and fixed beamformer design for a generalized sidelobe canceler for speech enhancement in a reverberant enclosure. Both are based on a new method for estimating the acoustical transfer function ratios in the presence of stationary noise. The estimation method relies on solving a generalized eigenvalue problem in each frequency bin. An adaptive eigenvector tracking scheme utilizing the power iteration method is employed and shown to achieve a high convergence speed.
Simulation results demonstrate that the proposed beamformer leads to better noise and interference reduction and to lower speech distortion than other blocking matrix designs from the literature.}},
  author = {{Krueger, Alexander and Warsitz, Ernst and Haeb-Umbach, Reinhold}},
  journal = {{IEEE Transactions on Audio, Speech, and Language Processing}},
  keywords = {{acoustical transfer function ratio, adaptive eigenvector tracking, array signal processing, beamformer design, blocking matrix, eigenvalues and eigenfunctions, eigenvector-based transfer function ratios estimation, generalized sidelobe canceler, interference reduction, iterative methods, power iteration method, reduced speech distortions, reverberant enclosure, reverberation, speech enhancement, stationary noise}},
  number = {{1}},
  pages = {{206--219}},
  title = {{{Speech Enhancement With a GSC-Like Structure Employing Eigenvector-Based Transfer Function Ratios Estimation}}},
  doi = {{10.1109/TASL.2010.2047324}},
  volume = {{19}},
  year = {{2011}},
}

@article{11846,
  abstract = {{In this paper, we present a new technique for automatic speech recognition (ASR) in reverberant environments. Our approach is aimed at the enhancement of the logarithmic Mel power spectrum, which is computed at an intermediate stage to obtain the widely used Mel frequency cepstral coefficients (MFCCs). Given the reverberant logarithmic Mel power spectral coefficients (LMPSCs), a minimum mean square error estimate of the clean LMPSCs is computed by carrying out Bayesian inference. We employ switching linear dynamical models as an a priori model for the dynamics of the clean LMPSCs. Further, we derive a stochastic observation model which relates the clean LMPSCs to the reverberant ones through a simplified model of the room impulse response (RIR). This model requires only two parameters, namely the RIR energy and the reverberation time, which can be estimated from the captured microphone signal. The performance of the proposed enhancement technique is studied on the AURORA5 database and compared to that of constrained maximum-likelihood linear regression (CMLLR). Experimental results show that our approach significantly outperforms CMLLR and that up to 80\% of the errors caused by reverberation are recovered. The approach is compatible with standard MFCC feature vectors and leaves the ASR back-end unchanged. It is of moderate computational complexity and suitable for real-time applications.}},
  author = {{Krueger, Alexander and Haeb-Umbach, Reinhold}},
  journal = {{IEEE Transactions on Audio, Speech, and Language Processing}},
  keywords = {{ASR, AURORA5 database, automatic speech recognition, Bayesian inference, belief networks, CMLLR, computational complexity, constrained maximum likelihood linear regression, least mean squares methods, LMPSC computation, logarithmic Mel power spectrum, maximum likelihood estimation, Mel frequency cepstral coefficients, MFCC feature vectors, microphone signal, minimum mean square error estimation, model-based feature enhancement, regression analysis, reverberant speech recognition, reverberation, RIR energy, room impulse response, speech recognition, stochastic observation model, stochastic processes}},
  number = {{7}},
  pages = {{1692--1707}},
  title = {{{Model-Based Feature Enhancement for Reverberant Speech Recognition}}},
  doi = {{10.1109/TASL.2010.2049684}},
  volume = {{18}},
  year = {{2010}},
}