[{"has_accepted_license":"1","related_material":{"link":[{"relation":"software","url":"https://github.com/fgnt/meeteval"}]},"citation":{"apa":"von Neumann, T., Boeddeker, C., Delcroix, M., &#38; Haeb-Umbach, R. (2023). MeetEval: A Toolkit for Computation of Word Error Rates for Meeting Transcription Systems. <i>Proc. CHiME 2023 Workshop on Speech Processing in Everyday Environments</i>. CHiME 2023 Workshop on Speech Processing in Everyday Environments, Dublin.","bibtex":"@inproceedings{vonNeumann_Boeddeker_Delcroix_Haeb-Umbach_2023, title={MeetEval: A Toolkit for Computation of Word Error Rates for Meeting Transcription Systems}, booktitle={Proc. CHiME 2023 Workshop on Speech Processing in Everyday Environments}, author={von Neumann, Thilo and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}, year={2023} }","short":"T. von Neumann, C. Boeddeker, M. Delcroix, R. Haeb-Umbach, in: Proc. CHiME 2023 Workshop on Speech Processing in Everyday Environments, 2023.","mla":"von Neumann, Thilo, et al. “MeetEval: A Toolkit for Computation of Word Error Rates for Meeting Transcription Systems.” <i>Proc. CHiME 2023 Workshop on Speech Processing in Everyday Environments</i>, 2023.","ama":"von Neumann T, Boeddeker C, Delcroix M, Haeb-Umbach R. MeetEval: A Toolkit for Computation of Word Error Rates for Meeting Transcription Systems. In: <i>Proc. CHiME 2023 Workshop on Speech Processing in Everyday Environments</i>. ; 2023.","chicago":"Neumann, Thilo von, Christoph Boeddeker, Marc Delcroix, and Reinhold Haeb-Umbach. “MeetEval: A Toolkit for Computation of Word Error Rates for Meeting Transcription Systems.” In <i>Proc. CHiME 2023 Workshop on Speech Processing in Everyday Environments</i>, 2023.","ieee":"T. von Neumann, C. Boeddeker, M. Delcroix, and R. 
Haeb-Umbach, “MeetEval: A Toolkit for Computation of Word Error Rates for Meeting Transcription Systems,” presented at the CHiME 2023 Workshop on Speech Processing in Everyday Environments, Dublin, 2023."},"oa":"1","date_updated":"2025-02-12T09:12:05Z","author":[{"full_name":"von Neumann, Thilo","id":"49870","orcid":"https://orcid.org/0000-0002-7717-8670","last_name":"von Neumann","first_name":"Thilo"},{"last_name":"Boeddeker","id":"40767","full_name":"Boeddeker, Christoph","first_name":"Christoph"},{"full_name":"Delcroix, Marc","last_name":"Delcroix","first_name":"Marc"},{"full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach","first_name":"Reinhold"}],"conference":{"name":"CHiME 2023 Workshop on Speech Processing in Everyday Environments","location":"Dublin"},"main_file_link":[{"url":"https://arxiv.org/abs/2307.11394","open_access":"1"}],"type":"conference","status":"public","_id":"48275","project":[{"name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"},{"_id":"508","name":"Automatische Transkription von Gesprächssituationen","grant_number":"448568305"}],"department":[{"_id":"54"}],"user_id":"40767","file_date_updated":"2023-10-19T07:19:59Z","quality_controlled":"1","year":"2023","date_created":"2023-10-19T07:24:51Z","title":"MeetEval: A Toolkit for Computation of Word Error Rates for Meeting Transcription Systems","publication":"Proc. 
CHiME 2023 Workshop on Speech Processing in Everyday Environments","abstract":[{"text":"MeetEval is an open-source toolkit to evaluate  all kinds of meeting transcription systems.\r\nIt provides a unified interface for the computation of commonly used Word Error Rates (WERs), specifically cpWER, ORC WER and MIMO WER along other WER definitions.\r\nWe extend the cpWER computation by a temporal constraint to ensure that only words are identified as correct when the temporal alignment is plausible.\r\nThis leads to a better quality of the matching of the hypothesis string to the reference string that more closely resembles the actual transcription quality, and a system is penalized if it provides poor time annotations.\r\nSince word-level timing information is often not available, we present a way to approximate exact word-level timings from segment-level timings (e.g., a sentence) and show that the approximation leads to a similar WER as a matching with exact word-level annotations.\r\nAt the same time, the time constraint leads to a speedup of the matching algorithm, which outweighs the additional overhead caused by processing the time stamps.","lang":"eng"}],"file":[{"relation":"main_file","content_type":"application/pdf","access_level":"open_access","file_name":"Chime_7__MeetEval.pdf","file_id":"48276","file_size":263744,"date_created":"2023-10-19T07:19:59Z","creator":"tvn","date_updated":"2023-10-19T07:19:59Z"}],"keyword":["Speech Recognition","Word Error Rate","Meeting Transcription"],"ddc":["000"],"language":[{"iso":"eng"}]},{"citation":{"chicago":"Neumann, Thilo von, Keisuke Kinoshita, Christoph Boeddeker, Marc Delcroix, and Reinhold Haeb-Umbach. “Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers.” In <i>Interspeech 2021</i>, 2021. <a href=\"https://doi.org/10.21437/interspeech.2021-1177\">https://doi.org/10.21437/interspeech.2021-1177</a>.","ieee":"T. von Neumann, K. Kinoshita, C. Boeddeker, M. 
Delcroix, and R. Haeb-Umbach, “Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers,” presented at the Interspeech, 2021, doi: <a href=\"https://doi.org/10.21437/interspeech.2021-1177\">10.21437/interspeech.2021-1177</a>.","ama":"von Neumann T, Kinoshita K, Boeddeker C, Delcroix M, Haeb-Umbach R. Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers. In: <i>Interspeech 2021</i>. ; 2021. doi:<a href=\"https://doi.org/10.21437/interspeech.2021-1177\">10.21437/interspeech.2021-1177</a>","apa":"von Neumann, T., Kinoshita, K., Boeddeker, C., Delcroix, M., &#38; Haeb-Umbach, R. (2021). Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers. <i>Interspeech 2021</i>. Interspeech. <a href=\"https://doi.org/10.21437/interspeech.2021-1177\">https://doi.org/10.21437/interspeech.2021-1177</a>","bibtex":"@inproceedings{vonNeumann_Kinoshita_Boeddeker_Delcroix_Haeb-Umbach_2021, title={Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers}, DOI={<a href=\"https://doi.org/10.21437/interspeech.2021-1177\">10.21437/interspeech.2021-1177</a>}, booktitle={Interspeech 2021}, author={von Neumann, Thilo and Kinoshita, Keisuke and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}, year={2021} }","short":"T. von Neumann, K. Kinoshita, C. Boeddeker, M. Delcroix, R. Haeb-Umbach, in: Interspeech 2021, 2021.","mla":"von Neumann, Thilo, et al. 
“Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers.” <i>Interspeech 2021</i>, 2021, doi:<a href=\"https://doi.org/10.21437/interspeech.2021-1177\">10.21437/interspeech.2021-1177</a>."},"related_material":{"link":[{"relation":"software","url":"https://github.com/fgnt/graph_pit"}]},"publication_status":"published","has_accepted_license":"1","conference":{"name":"Interspeech"},"doi":"10.21437/interspeech.2021-1177","author":[{"first_name":"Thilo","full_name":"von Neumann, Thilo","id":"49870","last_name":"von Neumann","orcid":"https://orcid.org/0000-0002-7717-8670"},{"first_name":"Keisuke","last_name":"Kinoshita","full_name":"Kinoshita, Keisuke"},{"full_name":"Boeddeker, Christoph","id":"40767","last_name":"Boeddeker","first_name":"Christoph"},{"full_name":"Delcroix, Marc","last_name":"Delcroix","first_name":"Marc"},{"full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach","first_name":"Reinhold"}],"date_updated":"2023-11-15T12:14:40Z","oa":"1","status":"public","type":"conference","file_date_updated":"2021-12-06T10:48:30Z","user_id":"49870","department":[{"_id":"54"}],"project":[{"name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"_id":"26770","year":"2021","quality_controlled":"1","title":"Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers","date_created":"2021-10-25T08:50:01Z","file":[{"creator":"tvn","date_created":"2021-12-06T10:39:13Z","date_updated":"2021-12-06T10:48:30Z","access_level":"open_access","file_id":"28327","file_name":"Interspeech 2021 voiceover-002-compressed.mp4","title":"Video for INTERSPEECH 2021","file_size":9550220,"content_type":"video/mp4","relation":"supplementary_material"},{"date_updated":"2021-12-06T10:47:01Z","creator":"tvn","date_created":"2021-12-06T10:47:01Z","title":"Slides from INTERSPEECH 
2021","file_size":1337297,"access_level":"open_access","file_id":"28328","file_name":"Graph-PIT-poster-presentation.pptx","content_type":"application/vnd.openxmlformats-officedocument.presentationml.presentation","relation":"slides"},{"file_size":226589,"access_level":"open_access","file_name":"INTERSPEECH2021_Graph_PIT.pdf","file_id":"28329","date_updated":"2021-12-06T10:48:21Z","creator":"tvn","date_created":"2021-12-06T10:48:21Z","relation":"main_file","content_type":"application/pdf"}],"abstract":[{"text":"Automatic transcription of meetings requires handling of overlapped speech, which calls for continuous speech separation (CSS) systems. The uPIT criterion was proposed for utterance-level separation with neural networks and introduces the constraint that the total number of speakers must not exceed the number of output channels. When processing meeting-like data in a segment-wise manner, i.e., by separating overlapping segments independently and stitching adjacent segments to continuous output streams, this constraint has to be fulfilled for any segment. In this contribution, we show that this constraint can be significantly relaxed. We propose a novel graph-based PIT criterion, which casts the assignment of utterances to output channels in a graph coloring problem. It only requires that the number of concurrently active speakers must not exceed the number of output channels. As a consequence, the system can process an arbitrary number of speakers and arbitrarily long segments and thus can handle more diverse scenarios.\r\nFurther, the stitching algorithm for obtaining a consistent output order in neighboring segments is of less importance and can even be eliminated completely, not the least reducing the computational effort. Experiments on meeting-style WSJ data show improvements in recognition performance over using the uPIT criterion. 
","lang":"eng"}],"publication":"Interspeech 2021","language":[{"iso":"eng"}],"ddc":["000"],"keyword":["Continuous speech separation","automatic speech recognition","overlapped speech","permutation invariant training"]},{"date_created":"2019-07-12T05:28:45Z","author":[{"last_name":"Heymann","id":"9168","full_name":"Heymann, Jahn","first_name":"Jahn"},{"full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach","first_name":"Reinhold"},{"full_name":"Golik, P.","last_name":"Golik","first_name":"P."},{"full_name":"Schlueter, R.","last_name":"Schlueter","first_name":"R."}],"date_updated":"2022-01-06T06:51:09Z","oa":"1","doi":"10.1109/ICASSP.2015.7178933","main_file_link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2015/hey_icassp_2015.pdf","open_access":"1"}],"title":"Unsupervised adaptation of a denoising autoencoder by Bayesian Feature Enhancement for reverberant asr under mismatch conditions","page":"5053-5057","citation":{"apa":"Heymann, J., Haeb-Umbach, R., Golik, P., &#38; Schlueter, R. (2015). Unsupervised adaptation of a denoising autoencoder by Bayesian Feature Enhancement for reverberant asr under mismatch conditions. In <i>Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on</i> (pp. 5053–5057). <a href=\"https://doi.org/10.1109/ICASSP.2015.7178933\">https://doi.org/10.1109/ICASSP.2015.7178933</a>","bibtex":"@inproceedings{Heymann_Haeb-Umbach_Golik_Schlueter_2015, title={Unsupervised adaptation of a denoising autoencoder by Bayesian Feature Enhancement for reverberant asr under mismatch conditions}, DOI={<a href=\"https://doi.org/10.1109/ICASSP.2015.7178933\">10.1109/ICASSP.2015.7178933</a>}, booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on}, author={Heymann, Jahn and Haeb-Umbach, Reinhold and Golik, P. and Schlueter, R.}, year={2015}, pages={5053–5057} }","mla":"Heymann, Jahn, et al. 
“Unsupervised Adaptation of a Denoising Autoencoder by Bayesian Feature Enhancement for Reverberant Asr under Mismatch Conditions.” <i>Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference On</i>, 2015, pp. 5053–57, doi:<a href=\"https://doi.org/10.1109/ICASSP.2015.7178933\">10.1109/ICASSP.2015.7178933</a>.","short":"J. Heymann, R. Haeb-Umbach, P. Golik, R. Schlueter, in: Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference On, 2015, pp. 5053–5057.","ieee":"J. Heymann, R. Haeb-Umbach, P. Golik, and R. Schlueter, “Unsupervised adaptation of a denoising autoencoder by Bayesian Feature Enhancement for reverberant asr under mismatch conditions,” in <i>Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on</i>, 2015, pp. 5053–5057.","chicago":"Heymann, Jahn, Reinhold Haeb-Umbach, P. Golik, and R. Schlueter. “Unsupervised Adaptation of a Denoising Autoencoder by Bayesian Feature Enhancement for Reverberant Asr under Mismatch Conditions.” In <i>Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference On</i>, 5053–57, 2015. <a href=\"https://doi.org/10.1109/ICASSP.2015.7178933\">https://doi.org/10.1109/ICASSP.2015.7178933</a>.","ama":"Heymann J, Haeb-Umbach R, Golik P, Schlueter R. Unsupervised adaptation of a denoising autoencoder by Bayesian Feature Enhancement for reverberant asr under mismatch conditions. In: <i>Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference On</i>. ; 2015:5053-5057. 
doi:<a href=\"https://doi.org/10.1109/ICASSP.2015.7178933\">10.1109/ICASSP.2015.7178933</a>"},"year":"2015","department":[{"_id":"54"}],"user_id":"44006","_id":"11813","language":[{"iso":"eng"}],"keyword":["codecs","signal denoising","speech recognition","Bayesian feature enhancement","denoising autoencoder","reverberant ASR","single-channel speech recognition","speaker to microphone distances","unsupervised adaptation","Adaptation models","Noise reduction","Reverberation","Speech","Speech recognition","Training","deep neuronal networks","denoising autoencoder","feature enhancement","robust speech recognition"],"publication":"Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on","type":"conference","status":"public","abstract":[{"lang":"eng","text":"The parametric Bayesian Feature Enhancement (BFE) and a datadriven Denoising Autoencoder (DA) both bring performance gains in severe single-channel speech recognition conditions. The first can be adjusted to different conditions by an appropriate parameter setting, while the latter needs to be trained on conditions similar to the ones expected at decoding time, making it vulnerable to a mismatch between training and test conditions. We use a DNN backend and study reverberant ASR under three types of mismatch conditions: different room reverberation times, different speaker to microphone distances and the difference between artificially reverberated data and the recordings in a reverberant environment. We show that for these mismatch conditions BFE can provide the targets for a DA. 
This unsupervised adaptation provides a performance gain over the direct use of BFE and even enables to compensate for the mismatch of real and simulated reverberant data."}]},{"date_updated":"2022-01-06T06:51:11Z","author":[{"full_name":"Leutnant, Volker","last_name":"Leutnant","first_name":"Volker"},{"first_name":"Alexander","full_name":"Krueger, Alexander","last_name":"Krueger"},{"id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach","first_name":"Reinhold"}],"date_created":"2019-07-12T05:29:41Z","volume":22,"title":"A New Observation Model in the Logarithmic Mel Power Spectral Domain for the Automatic Recognition of Noisy Reverberant Speech","doi":"10.1109/TASLP.2013.2285480","publication_identifier":{"issn":["2329-9290"]},"issue":"1","year":"2014","citation":{"ama":"Leutnant V, Krueger A, Haeb-Umbach R. A New Observation Model in the Logarithmic Mel Power Spectral Domain for the Automatic Recognition of Noisy Reverberant Speech. <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>. 2014;22(1):95-109. doi:<a href=\"https://doi.org/10.1109/TASLP.2013.2285480\">10.1109/TASLP.2013.2285480</a>","chicago":"Leutnant, Volker, Alexander Krueger, and Reinhold Haeb-Umbach. “A New Observation Model in the Logarithmic Mel Power Spectral Domain for the Automatic Recognition of Noisy Reverberant Speech.” <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i> 22, no. 1 (2014): 95–109. <a href=\"https://doi.org/10.1109/TASLP.2013.2285480\">https://doi.org/10.1109/TASLP.2013.2285480</a>.","ieee":"V. Leutnant, A. Krueger, and R. Haeb-Umbach, “A New Observation Model in the Logarithmic Mel Power Spectral Domain for the Automatic Recognition of Noisy Reverberant Speech,” <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>, vol. 22, no. 1, pp. 95–109, 2014.","apa":"Leutnant, V., Krueger, A., &#38; Haeb-Umbach, R. (2014). 
A New Observation Model in the Logarithmic Mel Power Spectral Domain for the Automatic Recognition of Noisy Reverberant Speech. <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>, <i>22</i>(1), 95–109. <a href=\"https://doi.org/10.1109/TASLP.2013.2285480\">https://doi.org/10.1109/TASLP.2013.2285480</a>","mla":"Leutnant, Volker, et al. “A New Observation Model in the Logarithmic Mel Power Spectral Domain for the Automatic Recognition of Noisy Reverberant Speech.” <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>, vol. 22, no. 1, 2014, pp. 95–109, doi:<a href=\"https://doi.org/10.1109/TASLP.2013.2285480\">10.1109/TASLP.2013.2285480</a>.","bibtex":"@article{Leutnant_Krueger_Haeb-Umbach_2014, title={A New Observation Model in the Logarithmic Mel Power Spectral Domain for the Automatic Recognition of Noisy Reverberant Speech}, volume={22}, DOI={<a href=\"https://doi.org/10.1109/TASLP.2013.2285480\">10.1109/TASLP.2013.2285480</a>}, number={1}, journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing}, author={Leutnant, Volker and Krueger, Alexander and Haeb-Umbach, Reinhold}, year={2014}, pages={95–109} }","short":"V. Leutnant, A. Krueger, R. 
Haeb-Umbach, IEEE/ACM Transactions on Audio, Speech, and Language Processing 22 (2014) 95–109."},"intvolume":"        22","page":"95-109","_id":"11861","user_id":"44006","department":[{"_id":"54"}],"keyword":["computational complexity","reverberation","speech recognition","automatic speech recognition","background noise","clean speech","computational complexity","energy compensation","logarithmic mel power spectral domain","mel frequency cepstral coefficients","microphone input signals","model-based feature compensation schemes","noisy reverberant speech automatic recognition","noisy reverberant speech features","reverberation","Atmospheric modeling","Computational modeling","Noise","Noise measurement","Reverberation","Speech","Vectors","Model-based feature compensation","observation model for reverberant and noisy speech","recursive observation model","robust automatic speech recognition"],"language":[{"iso":"eng"}],"type":"journal_article","publication":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","abstract":[{"text":"In this contribution we present a theoretical and experimental investigation into the effects of reverberation and noise on features in the logarithmic mel power spectral domain, an intermediate stage in the computation of the mel frequency cepstral coefficients, prevalent in automatic speech recognition (ASR). Gaining insight into the complex interaction between clean speech, noise, and noisy reverberant speech features is essential for any ASR system to be robust against noise and reverberation present in distant microphone input signals. The findings are gathered in a probabilistic formulation of an observation model which may be used in model-based feature compensation schemes. The proposed observation model extends previous models in three major directions: First, the contribution of additive background noise to the observation error is explicitly taken into account. 
Second, an energy compensation constant is introduced which ensures an unbiased estimate of the reverberant speech features, and, third, a recursive variant of the observation model is developed resulting in reduced computational complexity when used in model-based feature compensation. The experimental section is used to evaluate the accuracy of the model and to describe how its parameters can be determined from test data.","lang":"eng"}],"status":"public"},{"publication":"IEEE Transactions on Audio, Speech and Language Processing","type":"journal_article","abstract":[{"lang":"eng","text":"New waves of consumer-centric applications, such as voice search and voice interaction with mobile devices and home entertainment systems, increasingly require automatic speech recognition (ASR) to be robust to the full range of real-world noise and other acoustic distorting conditions. Despite its practical importance, however, the inherent links between and distinctions among the myriad of methods for noise-robust ASR have yet to be carefully studied in order to advance the field further. To this end, it is critical to establish a solid, consistent, and common mathematical foundation for noise-robust ASR, which is lacking at present. This article is intended to fill this gap and to provide a thorough overview of modern noise-robust techniques for ASR developed over the past 30 years. We emphasize methods that are proven to be successful and that are likely to sustain or expand their future applicability. We distill key insights from our comprehensive overview in this field and take a fresh look at a few old problems, which nevertheless are still highly relevant today. Specifically, we have analyzed and categorized a wide range of noise-robust techniques using five different criteria: 1) feature-domain vs. model-domain processing, 2) the use of prior knowledge about the acoustic environment distortion, 3) the use of explicit environment-distortion models, 4) deterministic vs. 
uncertainty processing, and 5) the use of acoustic models trained jointly with the same feature enhancement or model adaptation process used in the testing stage. With this taxonomy-oriented review, we equip the reader with the insight to choose among techniques and with the awareness of the performance-complexity tradeoffs. The pros and cons of using different noise-robust ASR techniques in practical application scenarios are provided as a guide to interested practitioners. The current challenges and future research directions in this field is also carefully analyzed."}],"status":"public","_id":"11867","department":[{"_id":"54"}],"user_id":"44006","keyword":["Speech recognition","compensation","distortion modeling","joint model training","noise","robustness","uncertainty processing"],"language":[{"iso":"eng"}],"issue":"4","year":"2014","page":"745-777","intvolume":"        22","citation":{"ama":"Li J, Deng L, Gong Y, Haeb-Umbach R. An Overview of Noise-Robust Automatic Speech Recognition. <i>IEEE Transactions on Audio, Speech and Language Processing</i>. 2014;22(4):745-777. doi:<a href=\"https://doi.org/10.1109/TASLP.2014.2304637\">10.1109/TASLP.2014.2304637</a>","chicago":"Li, Jinyu, Li Deng, Yifan Gong, and Reinhold Haeb-Umbach. “An Overview of Noise-Robust Automatic Speech Recognition.” <i>IEEE Transactions on Audio, Speech and Language Processing</i> 22, no. 4 (2014): 745–77. <a href=\"https://doi.org/10.1109/TASLP.2014.2304637\">https://doi.org/10.1109/TASLP.2014.2304637</a>.","ieee":"J. Li, L. Deng, Y. Gong, and R. Haeb-Umbach, “An Overview of Noise-Robust Automatic Speech Recognition,” <i>IEEE Transactions on Audio, Speech and Language Processing</i>, vol. 22, no. 4, pp. 
745–777, 2014.","bibtex":"@article{Li_Deng_Gong_Haeb-Umbach_2014, title={An Overview of Noise-Robust Automatic Speech Recognition}, volume={22}, DOI={<a href=\"https://doi.org/10.1109/TASLP.2014.2304637\">10.1109/TASLP.2014.2304637</a>}, number={4}, journal={IEEE Transactions on Audio, Speech and Language Processing}, author={Li, Jinyu and Deng, Li and Gong, Yifan and Haeb-Umbach, Reinhold}, year={2014}, pages={745–777} }","mla":"Li, Jinyu, et al. “An Overview of Noise-Robust Automatic Speech Recognition.” <i>IEEE Transactions on Audio, Speech and Language Processing</i>, vol. 22, no. 4, 2014, pp. 745–77, doi:<a href=\"https://doi.org/10.1109/TASLP.2014.2304637\">10.1109/TASLP.2014.2304637</a>.","short":"J. Li, L. Deng, Y. Gong, R. Haeb-Umbach, IEEE Transactions on Audio, Speech and Language Processing 22 (2014) 745–777.","apa":"Li, J., Deng, L., Gong, Y., &#38; Haeb-Umbach, R. (2014). An Overview of Noise-Robust Automatic Speech Recognition. <i>IEEE Transactions on Audio, Speech and Language Processing</i>, <i>22</i>(4), 745–777. 
<a href=\"https://doi.org/10.1109/TASLP.2014.2304637\">https://doi.org/10.1109/TASLP.2014.2304637</a>"},"date_updated":"2022-01-06T06:51:11Z","oa":"1","volume":22,"date_created":"2019-07-12T05:29:47Z","author":[{"first_name":"Jinyu","last_name":"Li","full_name":"Li, Jinyu"},{"last_name":"Deng","full_name":"Deng, Li","first_name":"Li"},{"first_name":"Yifan","full_name":"Gong, Yifan","last_name":"Gong"},{"last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold"}],"title":"An Overview of Noise-Robust Automatic Speech Recognition","doi":"10.1109/TASLP.2014.2304637","main_file_link":[{"open_access":"1","url":"http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6732927"}]},{"year":"2013","citation":{"bibtex":"@inproceedings{Abdelaziz_Zeiler_Kolossa_Leutnant_Haeb-Umbach_2013, title={GMM-based significance decoding}, DOI={<a href=\"https://doi.org/10.1109/ICASSP.2013.6638984\">10.1109/ICASSP.2013.6638984</a>}, booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2013 IEEE International Conference on}, author={Abdelaziz, Ahmed H. and Zeiler, Steffen and Kolossa, Dorothea and Leutnant, Volker and Haeb-Umbach, Reinhold}, year={2013}, pages={6827–6831} }","mla":"Abdelaziz, Ahmed H., et al. “GMM-Based Significance Decoding.” <i>Acoustics, Speech and Signal Processing (ICASSP), 2013 IEEE International Conference On</i>, 2013, pp. 6827–31, doi:<a href=\"https://doi.org/10.1109/ICASSP.2013.6638984\">10.1109/ICASSP.2013.6638984</a>.","short":"A.H. Abdelaziz, S. Zeiler, D. Kolossa, V. Leutnant, R. Haeb-Umbach, in: Acoustics, Speech and Signal Processing (ICASSP), 2013 IEEE International Conference On, 2013, pp. 6827–6831.","apa":"Abdelaziz, A. H., Zeiler, S., Kolossa, D., Leutnant, V., &#38; Haeb-Umbach, R. (2013). GMM-based significance decoding. In <i>Acoustics, Speech and Signal Processing (ICASSP), 2013 IEEE International Conference on</i> (pp. 6827–6831). 
<a href=\"https://doi.org/10.1109/ICASSP.2013.6638984\">https://doi.org/10.1109/ICASSP.2013.6638984</a>","ama":"Abdelaziz AH, Zeiler S, Kolossa D, Leutnant V, Haeb-Umbach R. GMM-based significance decoding. In: <i>Acoustics, Speech and Signal Processing (ICASSP), 2013 IEEE International Conference On</i>. ; 2013:6827-6831. doi:<a href=\"https://doi.org/10.1109/ICASSP.2013.6638984\">10.1109/ICASSP.2013.6638984</a>","ieee":"A. H. Abdelaziz, S. Zeiler, D. Kolossa, V. Leutnant, and R. Haeb-Umbach, “GMM-based significance decoding,” in <i>Acoustics, Speech and Signal Processing (ICASSP), 2013 IEEE International Conference on</i>, 2013, pp. 6827–6831.","chicago":"Abdelaziz, Ahmed H., Steffen Zeiler, Dorothea Kolossa, Volker Leutnant, and Reinhold Haeb-Umbach. “GMM-Based Significance Decoding.” In <i>Acoustics, Speech and Signal Processing (ICASSP), 2013 IEEE International Conference On</i>, 6827–31, 2013. <a href=\"https://doi.org/10.1109/ICASSP.2013.6638984\">https://doi.org/10.1109/ICASSP.2013.6638984</a>."},"page":"6827-6831","publication_identifier":{"issn":["1520-6149"]},"title":"GMM-based significance decoding","doi":"10.1109/ICASSP.2013.6638984","date_updated":"2022-01-06T06:51:07Z","date_created":"2019-07-12T05:26:53Z","author":[{"last_name":"Abdelaziz","full_name":"Abdelaziz, Ahmed H.","first_name":"Ahmed H."},{"full_name":"Zeiler, Steffen","last_name":"Zeiler","first_name":"Steffen"},{"first_name":"Dorothea","full_name":"Kolossa, Dorothea","last_name":"Kolossa"},{"first_name":"Volker","last_name":"Leutnant","full_name":"Leutnant, Volker"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242"}],"abstract":[{"lang":"eng","text":"The accuracy of automatic speech recognition systems in noisy and reverberant environments can be improved notably by exploiting the uncertainty of the estimated speech features using so-called uncertainty-of-observation techniques. 
In this paper, we introduce a new Bayesian decision rule that can serve as a mathematical framework from which both known and new uncertainty-of-observation techniques can be either derived or approximated. The new decision rule in its direct form leads to the new significance decoding approach for Gaussian mixture models, which results in better performance compared to standard uncertainty-of-observation techniques in different additive and convolutive noise scenarios."}],"status":"public","type":"conference","publication":"Acoustics, Speech and Signal Processing (ICASSP), 2013 IEEE International Conference on","keyword":["Bayes methods","Gaussian processes","convolution","decision theory","decoding","noise","reverberation","speech coding","speech recognition","Bayesian decision rule","GMM","Gaussian mixture models","additive noise scenarios","automatic speech recognition systems","convolutive noise scenarios","decoding approach","mathematical framework","reverberant environments","significance decoding","speech feature estimation","uncertainty-of-observation techniques","Hidden Markov models","Maximum likelihood decoding","Noise","Speech","Speech recognition","Uncertainty","Uncertainty-of-observation","modified imputation","noise robust speech recognition","significance decoding","uncertainty decoding"],"language":[{"iso":"eng"}],"_id":"11716","user_id":"44006","department":[{"_id":"54"}]},{"issue":"8","year":"2013","intvolume":"        21","page":"1640-1652","citation":{"short":"V. Leutnant, A. Krueger, R. 
Haeb-Umbach, IEEE Transactions on Audio, Speech, and Language Processing 21 (2013) 1640–1652.","bibtex":"@article{Leutnant_Krueger_Haeb-Umbach_2013, title={Bayesian Feature Enhancement for Reverberation and Noise Robust Speech Recognition}, volume={21}, DOI={<a href=\"https://doi.org/10.1109/TASL.2013.2258013\">10.1109/TASL.2013.2258013</a>}, number={8}, journal={IEEE Transactions on Audio, Speech, and Language Processing}, author={Leutnant, Volker and Krueger, Alexander and Haeb-Umbach, Reinhold}, year={2013}, pages={1640–1652} }","mla":"Leutnant, Volker, et al. “Bayesian Feature Enhancement for Reverberation and Noise Robust Speech Recognition.” <i>IEEE Transactions on Audio, Speech, and Language Processing</i>, vol. 21, no. 8, 2013, pp. 1640–52, doi:<a href=\"https://doi.org/10.1109/TASL.2013.2258013\">10.1109/TASL.2013.2258013</a>.","apa":"Leutnant, V., Krueger, A., &#38; Haeb-Umbach, R. (2013). Bayesian Feature Enhancement for Reverberation and Noise Robust Speech Recognition. <i>IEEE Transactions on Audio, Speech, and Language Processing</i>, <i>21</i>(8), 1640–1652. <a href=\"https://doi.org/10.1109/TASL.2013.2258013\">https://doi.org/10.1109/TASL.2013.2258013</a>","chicago":"Leutnant, Volker, Alexander Krueger, and Reinhold Haeb-Umbach. “Bayesian Feature Enhancement for Reverberation and Noise Robust Speech Recognition.” <i>IEEE Transactions on Audio, Speech, and Language Processing</i> 21, no. 8 (2013): 1640–52. <a href=\"https://doi.org/10.1109/TASL.2013.2258013\">https://doi.org/10.1109/TASL.2013.2258013</a>.","ieee":"V. Leutnant, A. Krueger, and R. Haeb-Umbach, “Bayesian Feature Enhancement for Reverberation and Noise Robust Speech Recognition,” <i>IEEE Transactions on Audio, Speech, and Language Processing</i>, vol. 21, no. 8, pp. 1640–1652, 2013.","ama":"Leutnant V, Krueger A, Haeb-Umbach R. Bayesian Feature Enhancement for Reverberation and Noise Robust Speech Recognition. <i>IEEE Transactions on Audio, Speech, and Language Processing</i>. 
2013;21(8):1640-1652. doi:<a href=\"https://doi.org/10.1109/TASL.2013.2258013\">10.1109/TASL.2013.2258013</a>"},"date_updated":"2022-01-06T06:51:11Z","volume":21,"date_created":"2019-07-12T05:29:42Z","author":[{"last_name":"Leutnant","full_name":"Leutnant, Volker","first_name":"Volker"},{"last_name":"Krueger","full_name":"Krueger, Alexander","first_name":"Alexander"},{"last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold"}],"title":"Bayesian Feature Enhancement for Reverberation and Noise Robust Speech Recognition","doi":"10.1109/TASL.2013.2258013","publication":"IEEE Transactions on Audio, Speech, and Language Processing","type":"journal_article","abstract":[{"lang":"eng","text":"In this contribution we extend a previously proposed Bayesian approach for the enhancement of reverberant logarithmic mel power spectral coefficients for robust automatic speech recognition to the additional compensation of background noise. A recently proposed observation model is employed whose time-variant observation error statistics are obtained as a side product of the inference of the a posteriori probability density function of the clean speech feature vectors. Further a reduction of the computational effort and the memory requirements are achieved by using a recursive formulation of the observation model. The performance of the proposed algorithms is first experimentally studied on a connected digits recognition task with artificially created noisy reverberant data. It is shown that the use of the time-variant observation error model leads to a significant error rate reduction at low signal-to-noise ratios compared to a time-invariant model. Further experiments were conducted on a 5000 word task recorded in a reverberant and noisy environment. 
A significant word error rate reduction was obtained demonstrating the effectiveness of the approach on real-world data."}],"status":"public","_id":"11862","department":[{"_id":"54"}],"user_id":"44006","keyword":["Bayes methods","compensation","error statistics","reverberation","speech recognition","Bayesian feature enhancement","background noise","clean speech feature vectors","compensation","connected digits recognition task","error statistics","memory requirements","noisy reverberant data","posteriori probability density function","recursive formulation","reverberant logarithmic mel power spectral coefficients","robust automatic speech recognition","signal-to-noise ratios","time-variant observation","word error rate reduction","Robust automatic speech recognition","model-based Bayesian feature enhancement","observation model for reverberant and noisy speech","recursive observation model"],"language":[{"iso":"eng"}]},{"oa":"1","date_updated":"2022-01-06T06:51:11Z","author":[{"last_name":"Leutnant","full_name":"Leutnant, Volker","first_name":"Volker"},{"last_name":"Krueger","full_name":"Krueger, Alexander","first_name":"Alexander"},{"id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach","first_name":"Reinhold"}],"date_created":"2019-07-12T05:29:44Z","title":"A Statistical Observation Model For Noisy Reverberant Speech Features and its Application to Robust ASR","main_file_link":[{"open_access":"1","url":"http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6335731"}],"year":"2012","citation":{"mla":"Leutnant, Volker, et al. “A Statistical Observation Model For Noisy Reverberant Speech Features and Its Application to Robust ASR.” <i>Signal Processing, Communications and Computing (ICSPCC), 2012 IEEE International Conference On</i>, 2012.","short":"V. Leutnant, A. Krueger, R. 
Haeb-Umbach, in: Signal Processing, Communications and Computing (ICSPCC), 2012 IEEE International Conference On, 2012.","bibtex":"@inproceedings{Leutnant_Krueger_Haeb-Umbach_2012, title={A Statistical Observation Model For Noisy Reverberant Speech Features and its Application to Robust ASR}, booktitle={Signal Processing, Communications and Computing (ICSPCC), 2012 IEEE International Conference on}, author={Leutnant, Volker and Krueger, Alexander and Haeb-Umbach, Reinhold}, year={2012} }","apa":"Leutnant, V., Krueger, A., &#38; Haeb-Umbach, R. (2012). A Statistical Observation Model For Noisy Reverberant Speech Features and its Application to Robust ASR. In <i>Signal Processing, Communications and Computing (ICSPCC), 2012 IEEE International Conference on</i>.","ieee":"V. Leutnant, A. Krueger, and R. Haeb-Umbach, “A Statistical Observation Model For Noisy Reverberant Speech Features and its Application to Robust ASR,” in <i>Signal Processing, Communications and Computing (ICSPCC), 2012 IEEE International Conference on</i>, 2012.","chicago":"Leutnant, Volker, Alexander Krueger, and Reinhold Haeb-Umbach. “A Statistical Observation Model For Noisy Reverberant Speech Features and Its Application to Robust ASR.” In <i>Signal Processing, Communications and Computing (ICSPCC), 2012 IEEE International Conference On</i>, 2012.","ama":"Leutnant V, Krueger A, Haeb-Umbach R. A Statistical Observation Model For Noisy Reverberant Speech Features and its Application to Robust ASR. In: <i>Signal Processing, Communications and Computing (ICSPCC), 2012 IEEE International Conference On</i>. 
; 2012."},"_id":"11864","user_id":"44006","department":[{"_id":"54"}],"keyword":["Robust Automatic Speech Recognition","Bayesian feature enhancement","observation model for reverberant and noisy speech"],"language":[{"iso":"eng"}],"type":"conference","publication":"Signal Processing, Communications and Computing (ICSPCC), 2012 IEEE International Conference on","abstract":[{"lang":"eng","text":"In this work, an observation model for the joint compensation of noise and reverberation in the logarithmic mel power spectral density domain is considered. It relates the features of the noisy reverberant speech to those of the non-reverberant speech and the noise. In contrast to enhancement of features only corrupted by reverberation (reverberant features), enhancement of noisy reverberant features requires a more sophisticated model for the error introduced by the proposed observation model. In a first consideration, it will be shown that this error is highly dependent on the instantaneous ratio of the power of reverberant speech to the power of the noise and, moreover, sensitive to the phase between reverberant speech and noise in the short-time discrete Fourier domain. Afterwards, a statistically motivated approach will be presented allowing for the model of the observation error to be inferred from the error model previously used for the reverberation only case. 
Finally, the developed observation error model will be utilized in a Bayesian feature enhancement scheme, leading to improvements in word accuracy on the AURORA5 database."}],"status":"public"},{"volume":18,"author":[{"first_name":"Alexander","last_name":"Krueger","full_name":"Krueger, Alexander"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold"}],"date_updated":"2022-01-06T06:51:11Z","oa":"1","doi":"10.1109/TASL.2010.2049684","main_file_link":[{"open_access":"1","url":"https://groups.uni-paderborn.de/nt/pubs/2010/KrHa10.pdf"}],"page":"1692-1707","intvolume":"        18","citation":{"ama":"Krueger A, Haeb-Umbach R. Model-Based Feature Enhancement for Reverberant Speech Recognition. <i>IEEE Transactions on Audio, Speech, and Language Processing</i>. 2010;18(7):1692-1707. doi:<a href=\"https://doi.org/10.1109/TASL.2010.2049684\">10.1109/TASL.2010.2049684</a>","ieee":"A. Krueger and R. Haeb-Umbach, “Model-Based Feature Enhancement for Reverberant Speech Recognition,” <i>IEEE Transactions on Audio, Speech, and Language Processing</i>, vol. 18, no. 7, pp. 1692–1707, 2010.","chicago":"Krueger, Alexander, and Reinhold Haeb-Umbach. “Model-Based Feature Enhancement for Reverberant Speech Recognition.” <i>IEEE Transactions on Audio, Speech, and Language Processing</i> 18, no. 7 (2010): 1692–1707. <a href=\"https://doi.org/10.1109/TASL.2010.2049684\">https://doi.org/10.1109/TASL.2010.2049684</a>.","apa":"Krueger, A., &#38; Haeb-Umbach, R. (2010). Model-Based Feature Enhancement for Reverberant Speech Recognition. <i>IEEE Transactions on Audio, Speech, and Language Processing</i>, <i>18</i>(7), 1692–1707. <a href=\"https://doi.org/10.1109/TASL.2010.2049684\">https://doi.org/10.1109/TASL.2010.2049684</a>","short":"A. Krueger, R. Haeb-Umbach, IEEE Transactions on Audio, Speech, and Language Processing 18 (2010) 1692–1707.","mla":"Krueger, Alexander, and Reinhold Haeb-Umbach. 
“Model-Based Feature Enhancement for Reverberant Speech Recognition.” <i>IEEE Transactions on Audio, Speech, and Language Processing</i>, vol. 18, no. 7, 2010, pp. 1692–707, doi:<a href=\"https://doi.org/10.1109/TASL.2010.2049684\">10.1109/TASL.2010.2049684</a>.","bibtex":"@article{Krueger_Haeb-Umbach_2010, title={Model-Based Feature Enhancement for Reverberant Speech Recognition}, volume={18}, DOI={<a href=\"https://doi.org/10.1109/TASL.2010.2049684\">10.1109/TASL.2010.2049684</a>}, number={7}, journal={IEEE Transactions on Audio, Speech, and Language Processing}, author={Krueger, Alexander and Haeb-Umbach, Reinhold}, year={2010}, pages={1692–1707} }"},"department":[{"_id":"54"}],"user_id":"44006","_id":"11846","type":"journal_article","status":"public","date_created":"2019-07-12T05:29:23Z","title":"Model-Based Feature Enhancement for Reverberant Speech Recognition","issue":"7","year":"2010","language":[{"iso":"eng"}],"keyword":["ASR","AURORA5 database","automatic speech recognition","Bayesian inference","belief networks","CMLLR","computational complexity","constrained maximum likelihood linear regression","least mean squares methods","LMPSC computation","logarithmic Mel power spectrum","maximum likelihood estimation","Mel frequency cepstral coefficients","MFCC feature vectors","microphone signal","minimum mean square error estimation","model-based feature enhancement","regression analysis","reverberant speech recognition","reverberation","RIR energy","room impulse response","speech recognition","stochastic observation model","stochastic processes"],"publication":"IEEE Transactions on Audio, Speech, and Language Processing","abstract":[{"text":"In this paper, we present a new technique for automatic speech recognition (ASR) in reverberant environments. Our approach is aimed at the enhancement of the logarithmic Mel power spectrum, which is computed at an intermediate stage to obtain the widely used Mel frequency cepstral coefficients (MFCCs). 
Given the reverberant logarithmic Mel power spectral coefficients (LMPSCs), a minimum mean square error estimate of the clean LMPSCs is computed by carrying out Bayesian inference. We employ switching linear dynamical models as an a priori model for the dynamics of the clean LMPSCs. Further, we derive a stochastic observation model which relates the clean to the reverberant LMPSCs through a simplified model of the room impulse response (RIR). This model requires only two parameters, namely RIR energy and reverberation time, which can be estimated from the captured microphone signal. The performance of the proposed enhancement technique is studied on the AURORA5 database and compared to that of constrained maximum-likelihood linear regression (CMLLR). It is shown by experimental results that our approach significantly outperforms CMLLR and that up to 80% of the errors caused by the reverberation are recovered. In addition to the fact that the approach is compatible with the standard MFCC feature vectors, it leaves the ASR back-end unchanged. 
It is of moderate computational complexity and suitable for real time applications.","lang":"eng"}]},{"volume":17,"author":[{"first_name":"Stefan","full_name":"Windmann, Stefan","last_name":"Windmann"},{"id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach","first_name":"Reinhold"}],"oa":"1","date_updated":"2022-01-06T06:51:12Z","doi":"10.1109/TASL.2009.2014894","main_file_link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2009/WiHa09-1.pdf","open_access":"1"}],"page":"974-984","intvolume":"        17","citation":{"bibtex":"@article{Windmann_Haeb-Umbach_2009, title={Approaches to Iterative Speech Feature Enhancement and Recognition}, volume={17}, DOI={<a href=\"https://doi.org/10.1109/TASL.2009.2014894\">10.1109/TASL.2009.2014894</a>}, number={5}, journal={IEEE Transactions on Audio, Speech, and Language Processing}, author={Windmann, Stefan and Haeb-Umbach, Reinhold}, year={2009}, pages={974–984} }","mla":"Windmann, Stefan, and Reinhold Haeb-Umbach. “Approaches to Iterative Speech Feature Enhancement and Recognition.” <i>IEEE Transactions on Audio, Speech, and Language Processing</i>, vol. 17, no. 5, 2009, pp. 974–84, doi:<a href=\"https://doi.org/10.1109/TASL.2009.2014894\">10.1109/TASL.2009.2014894</a>.","short":"S. Windmann, R. Haeb-Umbach, IEEE Transactions on Audio, Speech, and Language Processing 17 (2009) 974–984.","apa":"Windmann, S., &#38; Haeb-Umbach, R. (2009). Approaches to Iterative Speech Feature Enhancement and Recognition. <i>IEEE Transactions on Audio, Speech, and Language Processing</i>, <i>17</i>(5), 974–984. <a href=\"https://doi.org/10.1109/TASL.2009.2014894\">https://doi.org/10.1109/TASL.2009.2014894</a>","chicago":"Windmann, Stefan, and Reinhold Haeb-Umbach. “Approaches to Iterative Speech Feature Enhancement and Recognition.” <i>IEEE Transactions on Audio, Speech, and Language Processing</i> 17, no. 5 (2009): 974–84. 
<a href=\"https://doi.org/10.1109/TASL.2009.2014894\">https://doi.org/10.1109/TASL.2009.2014894</a>.","ieee":"S. Windmann and R. Haeb-Umbach, “Approaches to Iterative Speech Feature Enhancement and Recognition,” <i>IEEE Transactions on Audio, Speech, and Language Processing</i>, vol. 17, no. 5, pp. 974–984, 2009.","ama":"Windmann S, Haeb-Umbach R. Approaches to Iterative Speech Feature Enhancement and Recognition. <i>IEEE Transactions on Audio, Speech, and Language Processing</i>. 2009;17(5):974-984. doi:<a href=\"https://doi.org/10.1109/TASL.2009.2014894\">10.1109/TASL.2009.2014894</a>"},"department":[{"_id":"54"}],"user_id":"44006","_id":"11937","type":"journal_article","status":"public","date_created":"2019-07-12T05:31:08Z","title":"Approaches to Iterative Speech Feature Enhancement and Recognition","issue":"5","year":"2009","language":[{"iso":"eng"}],"keyword":["AURORA2 databases","AURORA4 databases","automatic speech recognition","feedback structures","hidden Markov models","HMM","iterative methods","iterative speech feature enhancement","model probabilities","speech decoding","speech enhancement","speech feature distribution","speech recognition","switching linear dynamic models"],"publication":"IEEE Transactions on Audio, Speech, and Language Processing","abstract":[{"text":"In automatic speech recognition, hidden Markov models (HMMs) are commonly used for speech decoding, while switching linear dynamic models (SLDMs) can be employed for a preceding model-based speech feature enhancement. In this paper, these model types are combined in order to obtain a novel iterative speech feature enhancement and recognition architecture. It is shown that speech feature enhancement with SLDMs can be improved by feeding back information from the HMM to the enhancement stage. Two different feedback structures are derived. 
In the first, the posteriors of the HMM states are used to control the model probabilities of the SLDMs, while in the second they are employed to directly influence the estimate of the speech feature distribution. Both approaches lead to improvements in recognition accuracy both on the AURORA2 and AURORA4 databases compared to non-iterative speech feature enhancement with SLDMs. It is also shown that a combination with uncertainty decoding further enhances performance.","lang":"eng"}]},{"intvolume":"        17","page":"1577-1590","citation":{"apa":"Windmann, S., &#38; Haeb-Umbach, R. (2009). Parameter Estimation of a State-Space Model of Noise for Robust Speech Recognition. <i>IEEE Transactions on Audio, Speech, and Language Processing</i>, <i>17</i>(8), 1577–1590. <a href=\"https://doi.org/10.1109/TASL.2009.2023172\">https://doi.org/10.1109/TASL.2009.2023172</a>","mla":"Windmann, Stefan, and Reinhold Haeb-Umbach. “Parameter Estimation of a State-Space Model of Noise for Robust Speech Recognition.” <i>IEEE Transactions on Audio, Speech, and Language Processing</i>, vol. 17, no. 8, 2009, pp. 1577–90, doi:<a href=\"https://doi.org/10.1109/TASL.2009.2023172\">10.1109/TASL.2009.2023172</a>.","short":"S. Windmann, R. Haeb-Umbach, IEEE Transactions on Audio, Speech, and Language Processing 17 (2009) 1577–1590.","bibtex":"@article{Windmann_Haeb-Umbach_2009, title={Parameter Estimation of a State-Space Model of Noise for Robust Speech Recognition}, volume={17}, DOI={<a href=\"https://doi.org/10.1109/TASL.2009.2023172\">10.1109/TASL.2009.2023172</a>}, number={8}, journal={IEEE Transactions on Audio, Speech, and Language Processing}, author={Windmann, Stefan and Haeb-Umbach, Reinhold}, year={2009}, pages={1577–1590} }","ama":"Windmann S, Haeb-Umbach R. Parameter Estimation of a State-Space Model of Noise for Robust Speech Recognition. <i>IEEE Transactions on Audio, Speech, and Language Processing</i>. 2009;17(8):1577-1590. 
doi:<a href=\"https://doi.org/10.1109/TASL.2009.2023172\">10.1109/TASL.2009.2023172</a>","ieee":"S. Windmann and R. Haeb-Umbach, “Parameter Estimation of a State-Space Model of Noise for Robust Speech Recognition,” <i>IEEE Transactions on Audio, Speech, and Language Processing</i>, vol. 17, no. 8, pp. 1577–1590, 2009.","chicago":"Windmann, Stefan, and Reinhold Haeb-Umbach. “Parameter Estimation of a State-Space Model of Noise for Robust Speech Recognition.” <i>IEEE Transactions on Audio, Speech, and Language Processing</i> 17, no. 8 (2009): 1577–90. <a href=\"https://doi.org/10.1109/TASL.2009.2023172\">https://doi.org/10.1109/TASL.2009.2023172</a>."},"year":"2009","issue":"8","doi":"10.1109/TASL.2009.2023172","main_file_link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2009/WiHa09-2.pdf","open_access":"1"}],"title":"Parameter Estimation of a State-Space Model of Noise for Robust Speech Recognition","volume":17,"author":[{"first_name":"Stefan","last_name":"Windmann","full_name":"Windmann, Stefan"},{"first_name":"Reinhold","full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach"}],"date_created":"2019-07-12T05:31:09Z","oa":"1","date_updated":"2022-01-06T06:51:12Z","status":"public","abstract":[{"lang":"eng","text":"In this paper, parameter estimation of a state-space model of noise or noisy speech cepstra is investigated. A blockwise EM algorithm is derived for the estimation of the state and observation noise covariance from noise-only input data. It is supposed to be used during the offline training mode of a speech recognizer. Further a sequential online EM algorithm is developed to adapt the observation noise covariance on noisy speech cepstra at its input. The estimated parameters are then used in model-based speech feature enhancement for noise-robust automatic speech recognition. 
Experiments on the AURORA4 database lead to improved recognition results with a linear state model compared to the assumption of stationary noise."}],"publication":"IEEE Transactions on Audio, Speech, and Language Processing","type":"journal_article","language":[{"iso":"eng"}],"keyword":["AURORA4 database","blockwise EM algorithm","covariance analysis","linear state model","noise covariance","noise-robust automatic speech recognition","noisy speech cepstra","offline training mode","parameter estimation","speech recognition","speech recognition equipment","speech recognizer","state-space methods","state-space model"],"department":[{"_id":"54"}],"user_id":"44006","_id":"11938"},{"keyword":["automatic speech recognition","bit errors","codecs","communication links","corrupted observations","decoding","distributed speech recognition","error-prone communication network","feature vector sequence","hidden Markov model-based ASR","hidden Markov models","inter-frame correlation","Internet telephony","network speech recognition","packet loss","speech posterior","speech recognition","transmission error robust speech recognition","uncertainty decoding","voice-over-IP codecs"],"language":[{"iso":"eng"}],"_id":"11820","department":[{"_id":"54"}],"user_id":"44006","abstract":[{"text":"In this paper, we derive an uncertainty decoding rule for automatic speech recognition (ASR), which accounts for both corrupted observations and inter-frame correlation. The conditional independence assumption, prevalent in hidden Markov model-based ASR, is relaxed to obtain a clean speech posterior that is conditioned on the complete observed feature vector sequence. This is a more informative posterior than one conditioned only on the current observation. The novel decoding is used to obtain a transmission-error robust remote ASR system, where the speech capturing unit is connected to the decoder via an error-prone communication network. 
We show how the clean speech posterior can be computed for communication links being characterized by either bit errors or packet loss. Recognition results are presented for both distributed and network speech recognition, where in the latter case common voice-over-IP codecs are employed.","lang":"eng"}],"status":"public","publication":"IEEE Transactions on Audio, Speech, and Language Processing","type":"journal_article","title":"A Novel Uncertainty Decoding Rule With Applications to Transmission Error Robust Speech Recognition","doi":"10.1109/TASL.2008.925879","main_file_link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2008/IoHa08-1.pdf","open_access":"1"}],"oa":"1","date_updated":"2022-01-06T06:51:10Z","volume":16,"date_created":"2019-07-12T05:28:53Z","author":[{"first_name":"Valentin","full_name":"Ion, Valentin","last_name":"Ion"},{"last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold"}],"year":"2008","page":"1047-1060","intvolume":"        16","citation":{"apa":"Ion, V., &#38; Haeb-Umbach, R. (2008). A Novel Uncertainty Decoding Rule With Applications to Transmission Error Robust Speech Recognition. <i>IEEE Transactions on Audio, Speech, and Language Processing</i>, <i>16</i>(5), 1047–1060. <a href=\"https://doi.org/10.1109/TASL.2008.925879\">https://doi.org/10.1109/TASL.2008.925879</a>","mla":"Ion, Valentin, and Reinhold Haeb-Umbach. “A Novel Uncertainty Decoding Rule With Applications to Transmission Error Robust Speech Recognition.” <i>IEEE Transactions on Audio, Speech, and Language Processing</i>, vol. 16, no. 5, 2008, pp. 
1047–60, doi:<a href=\"https://doi.org/10.1109/TASL.2008.925879\">10.1109/TASL.2008.925879</a>.","bibtex":"@article{Ion_Haeb-Umbach_2008, title={A Novel Uncertainty Decoding Rule With Applications to Transmission Error Robust Speech Recognition}, volume={16}, DOI={<a href=\"https://doi.org/10.1109/TASL.2008.925879\">10.1109/TASL.2008.925879</a>}, number={5}, journal={IEEE Transactions on Audio, Speech, and Language Processing}, author={Ion, Valentin and Haeb-Umbach, Reinhold}, year={2008}, pages={1047–1060} }","short":"V. Ion, R. Haeb-Umbach, IEEE Transactions on Audio, Speech, and Language Processing 16 (2008) 1047–1060.","ama":"Ion V, Haeb-Umbach R. A Novel Uncertainty Decoding Rule With Applications to Transmission Error Robust Speech Recognition. <i>IEEE Transactions on Audio, Speech, and Language Processing</i>. 2008;16(5):1047-1060. doi:<a href=\"https://doi.org/10.1109/TASL.2008.925879\">10.1109/TASL.2008.925879</a>","chicago":"Ion, Valentin, and Reinhold Haeb-Umbach. “A Novel Uncertainty Decoding Rule With Applications to Transmission Error Robust Speech Recognition.” <i>IEEE Transactions on Audio, Speech, and Language Processing</i> 16, no. 5 (2008): 1047–60. <a href=\"https://doi.org/10.1109/TASL.2008.925879\">https://doi.org/10.1109/TASL.2008.925879</a>.","ieee":"V. Ion and R. Haeb-Umbach, “A Novel Uncertainty Decoding Rule With Applications to Transmission Error Robust Speech Recognition,” <i>IEEE Transactions on Audio, Speech, and Language Processing</i>, vol. 16, no. 5, pp. 
1047–1060, 2008."},"issue":"5"},{"_id":"11824","user_id":"44006","department":[{"_id":"54"}],"keyword":["distributed speech recognition","least mean squares methods","MAP estimate","maximum likelihood estimation","MMSE estimate","packet loss compensation scheme","packet switched communication","posteriori probability density function","robust error mitigation method","soft-features","speech recognition","table lookup","voice communication","wireless channels"],"language":[{"iso":"eng"}],"type":"conference","publication":"IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2006)","abstract":[{"lang":"eng","text":"Soft-feature based speech recognition, which is an example of uncertainty decoding, has been proven to be a robust error mitigation method for distributed speech recognition over wireless channels exhibiting bit errors. In this paper we extend this concept to packet-oriented transmissions. The a posteriori probability density function of the lost feature vector, given the closest received neighbours, is computed. In the experiments, the nearest frame repetition, which is shown to be equivalent to the MAP estimate, outperforms the MMSE estimate for long bursts. Taking the variance into account at the speech recognition stage results in superior performance compared to classical schemes using point estimates. 
A computationally and memory efficient implementation of the proposed packet loss compensation scheme based on table lookup is presented"}],"status":"public","oa":"1","date_updated":"2022-01-06T06:51:10Z","date_created":"2019-07-12T05:28:58Z","author":[{"first_name":"Valentin","full_name":"Ion, Valentin","last_name":"Ion"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold"}],"volume":1,"title":"An Inexpensive Packet Loss Compensation Scheme for Distributed Speech Recognition Based on Soft-Features","main_file_link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2006/IoHa06-2.pdf","open_access":"1"}],"doi":"10.1109/ICASSP.2006.1659984","year":"2006","citation":{"ama":"Ion V, Haeb-Umbach R. An Inexpensive Packet Loss Compensation Scheme for Distributed Speech Recognition Based on Soft-Features. In: <i>IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2006)</i>. Vol 1. ; 2006:I. doi:<a href=\"https://doi.org/10.1109/ICASSP.2006.1659984\">10.1109/ICASSP.2006.1659984</a>","chicago":"Ion, Valentin, and Reinhold Haeb-Umbach. “An Inexpensive Packet Loss Compensation Scheme for Distributed Speech Recognition Based on Soft-Features.” In <i>IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2006)</i>, 1:I, 2006. <a href=\"https://doi.org/10.1109/ICASSP.2006.1659984\">https://doi.org/10.1109/ICASSP.2006.1659984</a>.","ieee":"V. Ion and R. Haeb-Umbach, “An Inexpensive Packet Loss Compensation Scheme for Distributed Speech Recognition Based on Soft-Features,” in <i>IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2006)</i>, 2006, vol. 1, p. I.","apa":"Ion, V., &#38; Haeb-Umbach, R. (2006). An Inexpensive Packet Loss Compensation Scheme for Distributed Speech Recognition Based on Soft-Features. In <i>IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2006)</i> (Vol. 1, p. I). 
<a href=\"https://doi.org/10.1109/ICASSP.2006.1659984\">https://doi.org/10.1109/ICASSP.2006.1659984</a>","bibtex":"@inproceedings{Ion_Haeb-Umbach_2006, title={An Inexpensive Packet Loss Compensation Scheme for Distributed Speech Recognition Based on Soft-Features}, volume={1}, DOI={<a href=\"https://doi.org/10.1109/ICASSP.2006.1659984\">10.1109/ICASSP.2006.1659984</a>}, booktitle={IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2006)}, author={Ion, Valentin and Haeb-Umbach, Reinhold}, year={2006}, pages={I} }","short":"V. Ion, R. Haeb-Umbach, in: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2006), 2006, p. I.","mla":"Ion, Valentin, and Reinhold Haeb-Umbach. “An Inexpensive Packet Loss Compensation Scheme for Distributed Speech Recognition Based on Soft-Features.” <i>IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2006)</i>, vol. 1, 2006, p. I, doi:<a href=\"https://doi.org/10.1109/ICASSP.2006.1659984\">10.1109/ICASSP.2006.1659984</a>."},"page":"I","intvolume":"         1"},{"keyword":["Channel error robustness","Distributed speech recognition","Soft features","Uncertainty decoding"],"language":[{"iso":"eng"}],"publication":"Speech Communication","abstract":[{"text":"In this paper, we propose an enhanced error concealment strategy at the server side of a distributed speech recognition (DSR) system, which is fully compatible with the existing DSR standard. It is based on a Bayesian approach, where the a posteriori probability density of the error-free feature vector is computed, given all received feature vectors which are possibly corrupted by transmission errors. Rather than computing a point estimate, such as the MMSE estimate, and plugging it into the Bayesian decision rule, we employ uncertainty decoding, which results in an integration over the uncertainty in the feature domain. 
In a typical scenario the communication between the thin client, often a mobile device, and the recognition server spreads across heterogeneous networks. Both bit errors on circuit-switched links and lost data packets on IP connections are mitigated by our approach in a unified manner. The experiments reveal improved robustness both for small- and large-vocabulary recognition tasks.","lang":"eng"}],"date_created":"2019-07-12T05:28:59Z","title":"Uncertainty decoding for distributed speech recognition over error-prone networks","issue":"11","year":"2006","_id":"11825","user_id":"44006","department":[{"_id":"54"}],"type":"journal_article","status":"public","date_updated":"2022-01-06T06:51:10Z","oa":"1","author":[{"first_name":"Valentin","last_name":"Ion","full_name":"Ion, Valentin"},{"last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold"}],"volume":48,"main_file_link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2006/IoHa06-3.pdf","open_access":"1"}],"doi":"10.1016/j.specom.2006.03.007","citation":{"bibtex":"@article{Ion_Haeb-Umbach_2006, title={Uncertainty decoding for distributed speech recognition over error-prone networks}, volume={48}, DOI={<a href=\"https://doi.org/10.1016/j.specom.2006.03.007\">10.1016/j.specom.2006.03.007</a>}, number={11}, journal={Speech Communication}, author={Ion, Valentin and Haeb-Umbach, Reinhold}, year={2006}, pages={1435–1446} }","mla":"Ion, Valentin, and Reinhold Haeb-Umbach. “Uncertainty Decoding for Distributed Speech Recognition over Error-Prone Networks.” <i>Speech Communication</i>, vol. 48, no. 11, 2006, pp. 1435–46, doi:<a href=\"https://doi.org/10.1016/j.specom.2006.03.007\">10.1016/j.specom.2006.03.007</a>.","short":"V. Ion, R. Haeb-Umbach, Speech Communication 48 (2006) 1435–1446.","apa":"Ion, V., &#38; Haeb-Umbach, R. (2006). Uncertainty decoding for distributed speech recognition over error-prone networks. <i>Speech Communication</i>, <i>48</i>(11), 1435–1446. 
<a href=\"https://doi.org/10.1016/j.specom.2006.03.007\">https://doi.org/10.1016/j.specom.2006.03.007</a>","ieee":"V. Ion and R. Haeb-Umbach, “Uncertainty decoding for distributed speech recognition over error-prone networks,” <i>Speech Communication</i>, vol. 48, no. 11, pp. 1435–1446, 2006.","chicago":"Ion, Valentin, and Reinhold Haeb-Umbach. “Uncertainty Decoding for Distributed Speech Recognition over Error-Prone Networks.” <i>Speech Communication</i> 48, no. 11 (2006): 1435–46. <a href=\"https://doi.org/10.1016/j.specom.2006.03.007\">https://doi.org/10.1016/j.specom.2006.03.007</a>.","ama":"Ion V, Haeb-Umbach R. Uncertainty decoding for distributed speech recognition over error-prone networks. <i>Speech Communication</i>. 2006;48(11):1435-1446. doi:<a href=\"https://doi.org/10.1016/j.specom.2006.03.007\">10.1016/j.specom.2006.03.007</a>"},"intvolume":"        48","page":"1435-1446"},{"publication":"IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2005)","type":"conference","status":"public","abstract":[{"lang":"eng","text":"In this paper we present a comparison of the recently proposed Soft-Feature Distributed Speech Recognition (SFDSR) with the two evaluated candidate codecs for Speech Enabled Services over wireless networks: Adaptive Multirate Codec (AMR) and the ETSI Extended Advanced Front-End for Distributed Speech Recognition (XAFE). It is shown that SFDSR achieves the best recognition performance on a simulated GSM transmission, followed by XAFE and AMR. We also present some new results concerning SFDSR which demonstrate the versatility of the approach. 
Further, a simple method is introduced which considerably reduces the computational effort."}],"department":[{"_id":"54"}],"user_id":"44006","_id":"11828","language":[{"iso":"eng"}],"keyword":["adaptive codes","adaptive multirate codec","AMR","distributed speech recognition","ETSI","extended advanced front-end","recognition performance","SFDSR","simulated GSM transmission","soft-feature distributed speech recognition","speech codecs","speech coding","speech recognition","variable rate codes","XAFE"],"page":"333-336","intvolume":"         1","citation":{"apa":"Ion, V., &#38; Haeb-Umbach, R. (2005). A Comparison of Soft-Feature Distributed Speech Recognition with Candidate Codecs for Speech Enabled Mobile Services. In <i>IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2005)</i> (Vol. 1, pp. 333–336). <a href=\"https://doi.org/10.1109/ICASSP.2005.1415118\">https://doi.org/10.1109/ICASSP.2005.1415118</a>","short":"V. Ion, R. Haeb-Umbach, in: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2005), 2005, pp. 333–336.","bibtex":"@inproceedings{Ion_Haeb-Umbach_2005, title={A Comparison of Soft-Feature Distributed Speech Recognition with Candidate Codecs for Speech Enabled Mobile Services}, volume={1}, DOI={<a href=\"https://doi.org/10.1109/ICASSP.2005.1415118\">10.1109/ICASSP.2005.1415118</a>}, booktitle={IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2005)}, author={Ion, Valentin and Haeb-Umbach, Reinhold}, year={2005}, pages={333–336} }","mla":"Ion, Valentin, and Reinhold Haeb-Umbach. “A Comparison of Soft-Feature Distributed Speech Recognition with Candidate Codecs for Speech Enabled Mobile Services.” <i>IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2005)</i>, vol. 1, 2005, pp. 333–36, doi:<a href=\"https://doi.org/10.1109/ICASSP.2005.1415118\">10.1109/ICASSP.2005.1415118</a>.","ama":"Ion V, Haeb-Umbach R. 
A Comparison of Soft-Feature Distributed Speech Recognition with Candidate Codecs for Speech Enabled Mobile Services. In: <i>IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2005)</i>. Vol 1. ; 2005:333-336. doi:<a href=\"https://doi.org/10.1109/ICASSP.2005.1415118\">10.1109/ICASSP.2005.1415118</a>","chicago":"Ion, Valentin, and Reinhold Haeb-Umbach. “A Comparison of Soft-Feature Distributed Speech Recognition with Candidate Codecs for Speech Enabled Mobile Services.” In <i>IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2005)</i>, 1:333–36, 2005. <a href=\"https://doi.org/10.1109/ICASSP.2005.1415118\">https://doi.org/10.1109/ICASSP.2005.1415118</a>.","ieee":"V. Ion and R. Haeb-Umbach, “A Comparison of Soft-Feature Distributed Speech Recognition with Candidate Codecs for Speech Enabled Mobile Services,” in <i>IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2005)</i>, 2005, vol. 1, pp. 333–336."},"year":"2005","volume":1,"date_created":"2019-07-12T05:29:02Z","author":[{"full_name":"Ion, Valentin","last_name":"Ion","first_name":"Valentin"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242"}],"date_updated":"2022-01-06T06:51:10Z","oa":"1","doi":"10.1109/ICASSP.2005.1415118","main_file_link":[{"open_access":"1","url":"https://groups.uni-paderborn.de/nt/pubs/2005/IoHa05-2.pdf"}],"title":"A Comparison of Soft-Feature Distributed Speech Recognition with Candidate Codecs for Speech Enabled Mobile Services"},{"keyword":["bimodal human-robot interface","binaural signal processing","enhanced single-channel input signal","filter-and-sum beamforming","filtering theory","FIR filter coefficient","generalized cross correlation method","microphones","microphone signal","nonlinear Bayesian tracking","particle filtering","robust adaptive algorithm","robust speaker direction estimation","signal processing","speech enhancement","speech 
recognition","speech recognizer","user interfaces"],"language":[{"iso":"eng"}],"_id":"11931","user_id":"44006","department":[{"_id":"54"}],"abstract":[{"text":"The paper is concerned with binaural signal processing for a bimodal human-robot interface with hearing and vision. The two microphone signals are processed to obtain an enhanced single-channel input signal for the subsequent speech recognizer and to localize the acoustic source, an important information for establishing a natural human-robot communication. We utilize a robust adaptive algorithm for filter-and-sum beamforming (FSB) and extract speaker direction information from the resulting FIR filter coefficients. Further, particle filtering is applied which conducts a nonlinear Bayesian tracking of speaker movement. Good location accuracy can be achieved even in highly reverberant environments. The results obtained outperform the conventional generalized cross correlation (GCC) method.","lang":"eng"}],"status":"public","type":"conference","publication":"IEEE Workshop on Multimedia Signal Processing (MMSP 2004)","title":"Robust speaker direction estimation with particle filtering","main_file_link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2004/WaHa04.pdf","open_access":"1"}],"doi":"10.1109/MMSP.2004.1436569","date_updated":"2022-01-06T06:51:12Z","oa":"1","author":[{"last_name":"Warsitz","full_name":"Warsitz, Ernst","first_name":"Ernst"},{"last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242","first_name":"Reinhold"}],"date_created":"2019-07-12T05:31:01Z","year":"2004","citation":{"ama":"Warsitz E, Haeb-Umbach R. Robust speaker direction estimation with particle filtering. In: <i>IEEE Workshop on Multimedia Signal Processing (MMSP 2004)</i>. ; 2004:367-370. doi:<a href=\"https://doi.org/10.1109/MMSP.2004.1436569\">10.1109/MMSP.2004.1436569</a>","ieee":"E. Warsitz and R. 
Haeb-Umbach, “Robust speaker direction estimation with particle filtering,” in <i>IEEE Workshop on Multimedia Signal Processing (MMSP 2004)</i>, 2004, pp. 367–370.","chicago":"Warsitz, Ernst, and Reinhold Haeb-Umbach. “Robust Speaker Direction Estimation with Particle Filtering.” In <i>IEEE Workshop on Multimedia Signal Processing (MMSP 2004)</i>, 367–70, 2004. <a href=\"https://doi.org/10.1109/MMSP.2004.1436569\">https://doi.org/10.1109/MMSP.2004.1436569</a>.","apa":"Warsitz, E., &#38; Haeb-Umbach, R. (2004). Robust speaker direction estimation with particle filtering. In <i>IEEE Workshop on Multimedia Signal Processing (MMSP 2004)</i> (pp. 367–370). <a href=\"https://doi.org/10.1109/MMSP.2004.1436569\">https://doi.org/10.1109/MMSP.2004.1436569</a>","mla":"Warsitz, Ernst, and Reinhold Haeb-Umbach. “Robust Speaker Direction Estimation with Particle Filtering.” <i>IEEE Workshop on Multimedia Signal Processing (MMSP 2004)</i>, 2004, pp. 367–70, doi:<a href=\"https://doi.org/10.1109/MMSP.2004.1436569\">10.1109/MMSP.2004.1436569</a>.","bibtex":"@inproceedings{Warsitz_Haeb-Umbach_2004, title={Robust speaker direction estimation with particle filtering}, DOI={<a href=\"https://doi.org/10.1109/MMSP.2004.1436569\">10.1109/MMSP.2004.1436569</a>}, booktitle={IEEE Workshop on Multimedia Signal Processing (MMSP 2004)}, author={Warsitz, Ernst and Haeb-Umbach, Reinhold}, year={2004}, pages={367–370} }","short":"E. Warsitz, R. Haeb-Umbach, in: IEEE Workshop on Multimedia Signal Processing (MMSP 2004), 2004, pp. 
367–370."},"page":"367-370"},{"_id":"39053","user_id":"5786","department":[{"_id":"672"}],"keyword":["User interfaces","Speech recognition","Streaming media","Specification languages","Keyboards","Speech synthesis","Rendering (computer graphics)","Ambient intelligence","Humans","Displays"],"language":[{"iso":"eng"}],"type":"conference","publication":"Proceedings of HICSS-37","abstract":[{"text":"Portable devices come with different limitations in user interaction like limited display size, small keyboard, and different sorts of input and output capabilities. With the advance of speech recognition and speech synthesis technologies, their complementary use becomes attractive for mobile devices in order to implement real multimodal user interaction. However, current systems and formats do not sufficiently integrate advanced multimodal interactions. We introduce an advanced generic multimodal interaction and rendering system (MIRS) dedicated for mobile devices. MIRS incorporates efficient processing of XML specification languages for limited, mobile devices and comes with the XML-based dialog and interface specification language (DISL). DISL can be considered as an UIML subset, which is enhanced by the means of state-oriented dialog specifications. 
The dialog specification is based on ODSN (object oriented dialog specification notation), which has been introduced to define user interface control by means of interaction states with transition rules.","lang":"eng"}],"status":"public","date_updated":"2023-01-24T08:46:37Z","author":[{"first_name":"Wolfgang","last_name":"Müller","full_name":"Müller, Wolfgang","id":"16243"},{"full_name":"Schäfer, Robbie","last_name":"Schäfer","first_name":"Robbie"},{"last_name":"Bleul","full_name":"Bleul, Steffen","first_name":"Steffen"}],"date_created":"2023-01-24T08:46:31Z","title":"Interactive Multimodal User Interfaces for Mobile Devices","doi":"10.1109/HICSS.2004.1265674","conference":{"name":"37th Annual Hawaii International Conference on System Sciences","location":"Waikoloa, HI, USA"},"publication_identifier":{"isbn":["0-7695-2056-1"]},"year":"2004","place":"Waikoloa, HI, USA","citation":{"short":"W. Müller, R. Schäfer, S. Bleul, in: Proceedings of HICSS-37, Waikoloa, HI, USA, 2004.","mla":"Müller, Wolfgang, et al. “Interactive Multimodal User Interfaces for Mobile Devices.” <i>Proceedings of HICSS-37</i>, 2004, doi:<a href=\"https://doi.org/10.1109/HICSS.2004.1265674\">10.1109/HICSS.2004.1265674</a>.","bibtex":"@inproceedings{Müller_Schäfer_Bleul_2004, place={Waikoloa, HI, USA}, title={Interactive Multimodal User Interfaces for Mobile Devices}, DOI={<a href=\"https://doi.org/10.1109/HICSS.2004.1265674\">10.1109/HICSS.2004.1265674</a>}, booktitle={Proceedings of HICSS-37}, author={Müller, Wolfgang and Schäfer, Robbie and Bleul, Steffen}, year={2004} }","apa":"Müller, W., Schäfer, R., &#38; Bleul, S. (2004). Interactive Multimodal User Interfaces for Mobile Devices. <i>Proceedings of HICSS-37</i>. 37th Annual Hawaii International Conference on System Sciences, Waikoloa, HI, USA. <a href=\"https://doi.org/10.1109/HICSS.2004.1265674\">https://doi.org/10.1109/HICSS.2004.1265674</a>","ama":"Müller W, Schäfer R, Bleul S. Interactive Multimodal User Interfaces for Mobile Devices. 
In: <i>Proceedings of HICSS-37</i>. ; 2004. doi:<a href=\"https://doi.org/10.1109/HICSS.2004.1265674\">10.1109/HICSS.2004.1265674</a>","ieee":"W. Müller, R. Schäfer, and S. Bleul, “Interactive Multimodal User Interfaces for Mobile Devices,” presented at the 37th Annual Hawaii International Conference on System Sciences, Waikoloa, HI, USA, 2004, doi: <a href=\"https://doi.org/10.1109/HICSS.2004.1265674\">10.1109/HICSS.2004.1265674</a>.","chicago":"Müller, Wolfgang, Robbie Schäfer, and Steffen Bleul. “Interactive Multimodal User Interfaces for Mobile Devices.” In <i>Proceedings of HICSS-37</i>. Waikoloa, HI, USA, 2004. <a href=\"https://doi.org/10.1109/HICSS.2004.1265674\">https://doi.org/10.1109/HICSS.2004.1265674</a>."}},{"page":"299-302","intvolume":"         9","citation":{"ieee":"R. Haeb-Umbach, “Automatic generation of phonetic regression class trees for MLLR adaptation,” <i>IEEE Transactions on Speech and Audio Processing</i>, vol. 9, no. 3, pp. 299–302, 2001.","chicago":"Haeb-Umbach, Reinhold. “Automatic Generation of Phonetic Regression Class Trees for MLLR Adaptation.” <i>IEEE Transactions on Speech and Audio Processing</i> 9, no. 3 (2001): 299–302. <a href=\"https://doi.org/10.1109/89.906003\">https://doi.org/10.1109/89.906003</a>.","ama":"Haeb-Umbach R. Automatic generation of phonetic regression class trees for MLLR adaptation. <i>IEEE Transactions on Speech and Audio Processing</i>. 2001;9(3):299-302. doi:<a href=\"https://doi.org/10.1109/89.906003\">10.1109/89.906003</a>","apa":"Haeb-Umbach, R. (2001). Automatic generation of phonetic regression class trees for MLLR adaptation. <i>IEEE Transactions on Speech and Audio Processing</i>, <i>9</i>(3), 299–302. <a href=\"https://doi.org/10.1109/89.906003\">https://doi.org/10.1109/89.906003</a>","mla":"Haeb-Umbach, Reinhold. “Automatic Generation of Phonetic Regression Class Trees for MLLR Adaptation.” <i>IEEE Transactions on Speech and Audio Processing</i>, vol. 9, no. 3, 2001, pp. 
299–302, doi:<a href=\"https://doi.org/10.1109/89.906003\">10.1109/89.906003</a>.","short":"R. Haeb-Umbach, IEEE Transactions on Speech and Audio Processing 9 (2001) 299–302.","bibtex":"@article{Haeb-Umbach_2001, title={Automatic generation of phonetic regression class trees for MLLR adaptation}, volume={9}, DOI={<a href=\"https://doi.org/10.1109/89.906003\">10.1109/89.906003</a>}, number={3}, journal={IEEE Transactions on Speech and Audio Processing}, author={Haeb-Umbach, Reinhold}, year={2001}, pages={299–302} }"},"year":"2001","issue":"3","doi":"10.1109/89.906003","main_file_link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2001/Ha01.pdf","open_access":"1"}],"title":"Automatic generation of phonetic regression class trees for MLLR adaptation","volume":9,"date_created":"2019-07-12T05:28:04Z","author":[{"first_name":"Reinhold","last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold"}],"oa":"1","date_updated":"2022-01-06T06:51:08Z","status":"public","abstract":[{"text":"In this paper, it is shown that a correlation criterion is the appropriate criterion for bottom-up clustering to obtain broad phonetic class regression trees for maximum likelihood linear regression (MLLR)-based speaker adaptation. The correlation structure among speech units is estimated on the speaker-independent training data. 
In adaptation experiments the tree outperformed a regression tree obtained from clustering according to closeness in acoustic space and achieved results comparable with those of a manually designed broad phonetic class tree","lang":"eng"}],"publication":"IEEE Transactions on Speech and Audio Processing","type":"journal_article","language":[{"iso":"eng"}],"keyword":["acoustic space","adaptation experiments","automatic generation","bottom-up clustering","broad phonetic class regression trees","correlation criterion","correlation methods","maximum likelihood estimation","maximum likelihood linear regression based speaker adaptation","MLLR adaptation","pattern clustering","phonetic regression class trees","speaker-independent training data","speech recognition","speech units","statistical analysis","trees (mathematics)"],"department":[{"_id":"54"}],"user_id":"44006","_id":"11778"},{"citation":{"short":"M. Lieb, R. Haeb-Umbach, in: IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2000), 2000, pp. II1105-II1108 vol.2.","mla":"Lieb, M., and Reinhold Haeb-Umbach. “LDA Derived Cepstral Trajectory Filters in Adverse Environmental Conditions.” <i>IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2000)</i>, vol. 2, 2000, pp. II1105-II1108 vol.2, doi:<a href=\"https://doi.org/10.1109/ICASSP.2000.859157\">10.1109/ICASSP.2000.859157</a>.","bibtex":"@inproceedings{Lieb_Haeb-Umbach_2000, title={LDA derived cepstral trajectory filters in adverse environmental conditions}, volume={2}, DOI={<a href=\"https://doi.org/10.1109/ICASSP.2000.859157\">10.1109/ICASSP.2000.859157</a>}, booktitle={IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2000)}, author={Lieb, M. and Haeb-Umbach, Reinhold}, year={2000}, pages={II1105-II1108 vol.2} }","apa":"Lieb, M., &#38; Haeb-Umbach, R. (2000). LDA derived cepstral trajectory filters in adverse environmental conditions. 
In <i>IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2000)</i> (Vol. 2, pp. II1105-II1108 vol.2). <a href=\"https://doi.org/10.1109/ICASSP.2000.859157\">https://doi.org/10.1109/ICASSP.2000.859157</a>","ieee":"M. Lieb and R. Haeb-Umbach, “LDA derived cepstral trajectory filters in adverse environmental conditions,” in <i>IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2000)</i>, 2000, vol. 2, pp. II1105-II1108 vol.2.","chicago":"Lieb, M., and Reinhold Haeb-Umbach. “LDA Derived Cepstral Trajectory Filters in Adverse Environmental Conditions.” In <i>IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2000)</i>, 2:II1105-II1108 vol.2, 2000. <a href=\"https://doi.org/10.1109/ICASSP.2000.859157\">https://doi.org/10.1109/ICASSP.2000.859157</a>.","ama":"Lieb M, Haeb-Umbach R. LDA derived cepstral trajectory filters in adverse environmental conditions. In: <i>IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2000)</i>. Vol 2. ; 2000:II1105-II1108 vol.2. 
doi:<a href=\"https://doi.org/10.1109/ICASSP.2000.859157\">10.1109/ICASSP.2000.859157</a>"},"intvolume":"         2","page":"II1105-II1108 vol.2","year":"2000","date_created":"2019-07-12T05:29:50Z","author":[{"last_name":"Lieb","full_name":"Lieb, M.","first_name":"M."},{"first_name":"Reinhold","last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold"}],"volume":2,"oa":"1","date_updated":"2022-01-06T06:51:11Z","main_file_link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2000/LiHa00.pdf","open_access":"1"}],"doi":"10.1109/ICASSP.2000.859157","title":"LDA derived cepstral trajectory filters in adverse environmental conditions","type":"conference","publication":"IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2000)","status":"public","abstract":[{"text":"Amongst several data driven approaches for designing filters for the time sequence of spectral parameters, the linear discriminant analysis (LDA) based method has been proposed for automatic speech recognition. Here we apply LDA-based filter design to cepstral features, which better match the inherent assumption of this method that feature vector components are uncorrelated. Extensive recognition experiments have been conducted both on the standard TIMIT phone recognition task and on a proprietary 130-words command word task under various adverse environmental conditions, including reverberant data with real-life room impulse responses and data processed by acoustic echo cancellation algorithms. Significant error rate reductions have been achieved when applying the novel long-range feature filters compared to standard approaches employing cepstral mean normalization and delta and delta-delta features, in particular when facing acoustic echo cancellation scenarios and room reverberation. 
For example, the phone accuracy on reverberated TIMIT data could be increased from 50.7\\% to 56.0\\%","lang":"eng"}],"user_id":"44006","department":[{"_id":"54"}],"_id":"11869","language":[{"iso":"eng"}],"keyword":["acoustic echo cancellation algorithms","adverse environmental conditions","automatic speech recognition","cepstral analysis","cepstral features","cepstral mean normalization","command word task","delta-delta features","delta features","echo suppression","error rate reductions","feature vector components","FIR filters","LDA derived cepstral trajectory filters","linear discriminant analysis","long-range feature filters","phone accuracy","real-life room impulse responses","reverberant data","spectral parameters","speech recognition","standard TIMIT phone recognition task"]}]
