[{"status":"public","date_created":"2019-07-12T05:29:23Z","volume":18,"author":[{"full_name":"Krueger, Alexander","first_name":"Alexander","last_name":"Krueger"},{"id":"242","last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold"}],"publication":"IEEE Transactions on Audio, Speech, and Language Processing","keyword":["ASR","AURORA5 database","automatic speech recognition","Bayesian inference","belief networks","CMLLR","computational complexity","constrained maximum likelihood linear regression","least mean squares methods","LMPSC computation","logarithmic Mel power spectrum","maximum likelihood estimation","Mel frequency cepstral coefficients","MFCC feature vectors","microphone signal","minimum mean square error estimation","model-based feature enhancement","regression analysis","reverberant speech recognition","reverberation","RIR energy","room impulse response","speech recognition","stochastic observation model","stochastic processes"],"user_id":"44006","abstract":[{"text":"In this paper, we present a new technique for automatic speech recognition (ASR) in reverberant environments. Our approach is aimed at the enhancement of the logarithmic Mel power spectrum, which is computed at an intermediate stage to obtain the widely used Mel frequency cepstral coefficients (MFCCs). Given the reverberant logarithmic Mel power spectral coefficients (LMPSCs), a minimum mean square error estimate of the clean LMPSCs is computed by carrying out Bayesian inference. We employ switching linear dynamical models as an a priori model for the dynamics of the clean LMPSCs. Further, we derive a stochastic observation model which relates the clean to the reverberant LMPSCs through a simplified model of the room impulse response (RIR). This model requires only two parameters, namely RIR energy and reverberation time, which can be estimated from the captured microphone signal. 
The performance of the proposed enhancement technique is studied on the AURORA5 database and compared to that of constrained maximum-likelihood linear regression (CMLLR). It is shown by experimental results that our approach significantly outperforms CMLLR and that up to 80% of the errors caused by the reverberation are recovered. In addition to the fact that the approach is compatible with the standard MFCC feature vectors, it leaves the ASR back-end unchanged. It is of moderate computational complexity and suitable for real time applications.","lang":"eng"}],"citation":{"mla":"Krueger, Alexander, and Reinhold Haeb-Umbach. “Model-Based Feature Enhancement for Reverberant Speech Recognition.” IEEE Transactions on Audio, Speech, and Language Processing, vol. 18, no. 7, 2010, pp. 1692–707, doi:10.1109/TASL.2010.2049684.","bibtex":"@article{Krueger_Haeb-Umbach_2010, title={Model-Based Feature Enhancement for Reverberant Speech Recognition}, volume={18}, DOI={10.1109/TASL.2010.2049684}, number={7}, journal={IEEE Transactions on Audio, Speech, and Language Processing}, author={Krueger, Alexander and Haeb-Umbach, Reinhold}, year={2010}, pages={1692–1707} }","ama":"Krueger A, Haeb-Umbach R. Model-Based Feature Enhancement for Reverberant Speech Recognition. IEEE Transactions on Audio, Speech, and Language Processing. 2010;18(7):1692-1707. doi:10.1109/TASL.2010.2049684","apa":"Krueger, A., & Haeb-Umbach, R. (2010). Model-Based Feature Enhancement for Reverberant Speech Recognition. IEEE Transactions on Audio, Speech, and Language Processing, 18(7), 1692–1707. https://doi.org/10.1109/TASL.2010.2049684","chicago":"Krueger, Alexander, and Reinhold Haeb-Umbach. “Model-Based Feature Enhancement for Reverberant Speech Recognition.” IEEE Transactions on Audio, Speech, and Language Processing 18, no. 7 (2010): 1692–1707. https://doi.org/10.1109/TASL.2010.2049684.","ieee":"A. Krueger and R. 
Haeb-Umbach, “Model-Based Feature Enhancement for Reverberant Speech Recognition,” IEEE Transactions on Audio, Speech, and Language Processing, vol. 18, no. 7, pp. 1692–1707, 2010.","short":"A. Krueger, R. Haeb-Umbach, IEEE Transactions on Audio, Speech, and Language Processing 18 (2010) 1692–1707."},"type":"journal_article","year":"2010","page":"1692-1707","main_file_link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2010/KrHa10.pdf","open_access":"1"}],"issue":"7","_id":"11846","intvolume":" 18","department":[{"_id":"54"}],"title":"Model-Based Feature Enhancement for Reverberant Speech Recognition","language":[{"iso":"eng"}],"oa":"1","doi":"10.1109/TASL.2010.2049684","date_updated":"2022-01-06T06:51:11Z"},{"doi":"10.1109/ICASSP.2010.5495994","oa":"1","_id":"11913","date_updated":"2022-01-06T06:51:12Z","page":"241-244","citation":{"mla":"Tran Vu, Dang Hai, and Reinhold Haeb-Umbach. “Blind Speech Separation Employing Directional Statistics in an Expectation Maximization Framework.” IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2010), 2010, pp. 241–44, doi:10.1109/ICASSP.2010.5495994.","bibtex":"@inproceedings{Tran Vu_Haeb-Umbach_2010, title={Blind speech separation employing directional statistics in an Expectation Maximization framework}, DOI={10.1109/ICASSP.2010.5495994}, booktitle={IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2010)}, author={Tran Vu, Dang Hai and Haeb-Umbach, Reinhold}, year={2010}, pages={241–244} }","chicago":"Tran Vu, Dang Hai, and Reinhold Haeb-Umbach. “Blind Speech Separation Employing Directional Statistics in an Expectation Maximization Framework.” In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2010), 241–44, 2010. https://doi.org/10.1109/ICASSP.2010.5495994.","apa":"Tran Vu, D. H., & Haeb-Umbach, R. (2010). Blind speech separation employing directional statistics in an Expectation Maximization framework. 
In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2010) (pp. 241–244). https://doi.org/10.1109/ICASSP.2010.5495994","ama":"Tran Vu DH, Haeb-Umbach R. Blind speech separation employing directional statistics in an Expectation Maximization framework. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2010). ; 2010:241-244. doi:10.1109/ICASSP.2010.5495994","ieee":"D. H. Tran Vu and R. Haeb-Umbach, “Blind speech separation employing directional statistics in an Expectation Maximization framework,” in IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2010), 2010, pp. 241–244.","short":"D.H. Tran Vu, R. Haeb-Umbach, in: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2010), 2010, pp. 241–244."},"year":"2010","type":"conference","language":[{"iso":"eng"}],"main_file_link":[{"open_access":"1","url":"https://groups.uni-paderborn.de/nt/pubs/2010/DaHa10-2.pdf"}],"title":"Blind speech separation employing directional statistics in an Expectation Maximization framework","user_id":"44006","abstract":[{"text":"In this paper we propose to employ directional statistics in a complex vector space to approach the problem of blind speech separation in the presence of spatially correlated noise. We interpret the values of the short time Fourier transform of the microphone signals to be draws from a mixture of complex Watson distributions, a probabilistic model which naturally accounts for spatial aliasing. The parameters of the density are related to the a priori source probabilities, the power of the sources and the transfer function ratios from sources to sensors. Estimation formulas are derived for these parameters by employing the Expectation Maximization (EM) algorithm. 
The E-step corresponds to the estimation of the source presence probabilities for each time-frequency bin, while the M-step leads to a maximum signal-to-noise ratio (MaxSNR) beamformer in the presence of uncertainty about the source activity. Experimental results are reported for an implementation in a generalized sidelobe canceller (GSC) like spatial beamforming configuration for 3 speech sources with significant coherent noise in reverberant environments, demonstrating the usefulness of the novel modeling framework.","lang":"eng"}],"date_created":"2019-07-12T05:30:40Z","status":"public","department":[{"_id":"54"}],"keyword":["array signal processing","blind source separation","blind speech separation","complex vector space","complex Watson distribution","directional statistics","expectation-maximisation algorithm","expectation maximization algorithm","Fourier transform","Fourier transforms","generalized sidelobe canceller","interference suppression","maximum signal-to-noise ratio beamformer","microphone signal","probabilistic model","spatial aliasing","spatial beamforming configuration","speech enhancement","statistical distributions"],"publication":"IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2010)","author":[{"last_name":"Tran Vu","full_name":"Tran Vu, Dang Hai","first_name":"Dang Hai"},{"first_name":"Reinhold","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach","id":"242"}]},{"department":[{"_id":"54"}],"publication":"IEEE Workshop on Multimedia Signal Processing (MMSP 2004)","keyword":["bimodal human-robot interface","binaural signal processing","enhanced single-channel input signal","filter-and-sum beamforming","filtering theory","FIR filter coefficient","generalized cross correlation method","microphones","microphone signal","nonlinear Bayesian tracking","particle filtering","robust adaptive algorithm","robust speaker direction estimation","signal processing","speech enhancement","speech recognition","speech 
recognizer","user interfaces"],"author":[{"last_name":"Warsitz","full_name":"Warsitz, Ernst","first_name":"Ernst"},{"full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold","id":"242","last_name":"Haeb-Umbach"}],"date_created":"2019-07-12T05:31:01Z","status":"public","abstract":[{"text":"The paper is concerned with binaural signal processing for a bimodal human-robot interface with hearing and vision. The two microphone signals are processed to obtain an enhanced single-channel input signal for the subsequent speech recognizer and to localize the acoustic source, an important information for establishing a natural human-robot communication. We utilize a robust adaptive algorithm for filter-and-sum beamforming (FSB) and extract speaker direction information from the resulting FIR filter coefficients. Further, particle filtering is applied which conducts a nonlinear Bayesian tracking of speaker movement. Good location accuracy can be achieved even in highly reverberant environments. The results obtained outperform the conventional generalized cross correlation (GCC) method.","lang":"eng"}],"user_id":"44006","title":"Robust speaker direction estimation with particle filtering","main_file_link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2004/WaHa04.pdf","open_access":"1"}],"language":[{"iso":"eng"}],"page":"367-370","year":"2004","type":"conference","citation":{"chicago":"Warsitz, Ernst, and Reinhold Haeb-Umbach. “Robust Speaker Direction Estimation with Particle Filtering.” In IEEE Workshop on Multimedia Signal Processing (MMSP 2004), 367–70, 2004. https://doi.org/10.1109/MMSP.2004.1436569.","apa":"Warsitz, E., & Haeb-Umbach, R. (2004). Robust speaker direction estimation with particle filtering. In IEEE Workshop on Multimedia Signal Processing (MMSP 2004) (pp. 367–370). https://doi.org/10.1109/MMSP.2004.1436569","ama":"Warsitz E, Haeb-Umbach R. Robust speaker direction estimation with particle filtering. 
In: IEEE Workshop on Multimedia Signal Processing (MMSP 2004). ; 2004:367-370. doi:10.1109/MMSP.2004.1436569","mla":"Warsitz, Ernst, and Reinhold Haeb-Umbach. “Robust Speaker Direction Estimation with Particle Filtering.” IEEE Workshop on Multimedia Signal Processing (MMSP 2004), 2004, pp. 367–70, doi:10.1109/MMSP.2004.1436569.","bibtex":"@inproceedings{Warsitz_Haeb-Umbach_2004, title={Robust speaker direction estimation with particle filtering}, DOI={10.1109/MMSP.2004.1436569}, booktitle={IEEE Workshop on Multimedia Signal Processing (MMSP 2004)}, author={Warsitz, Ernst and Haeb-Umbach, Reinhold}, year={2004}, pages={367–370} }","short":"E. Warsitz, R. Haeb-Umbach, in: IEEE Workshop on Multimedia Signal Processing (MMSP 2004), 2004, pp. 367–370.","ieee":"E. Warsitz and R. Haeb-Umbach, “Robust speaker direction estimation with particle filtering,” in IEEE Workshop on Multimedia Signal Processing (MMSP 2004), 2004, pp. 367–370."},"_id":"11931","date_updated":"2022-01-06T06:51:12Z","oa":"1","doi":"10.1109/MMSP.2004.1436569"}]