[{"article_type":"original","file_date_updated":"2023-01-11T08:50:19Z","_id":"35602","project":[{"_id":"52","name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"department":[{"_id":"54"}],"user_id":"49870","status":"public","type":"journal_article","doi":"10.1109/taslp.2022.3228629","oa":"1","date_updated":"2023-11-15T12:16:11Z","volume":31,"author":[{"first_name":"Thilo","orcid":"https://orcid.org/0000-0002-7717-8670","last_name":"von Neumann","id":"49870","full_name":"von Neumann, Thilo"},{"first_name":"Keisuke","last_name":"Kinoshita","full_name":"Kinoshita, Keisuke"},{"id":"40767","full_name":"Boeddeker, Christoph","last_name":"Boeddeker","first_name":"Christoph"},{"full_name":"Delcroix, Marc","last_name":"Delcroix","first_name":"Marc"},{"last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242","first_name":"Reinhold"}],"intvolume":"        31","page":"576-589","citation":{"ama":"von Neumann T, Kinoshita K, Boeddeker C, Delcroix M, Haeb-Umbach R. Segment-Less Continuous Speech Separation of Meetings: Training and Evaluation Criteria. <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>. 2023;31:576-589. doi:<a href=\"https://doi.org/10.1109/taslp.2022.3228629\">10.1109/taslp.2022.3228629</a>","ieee":"T. von Neumann, K. Kinoshita, C. Boeddeker, M. Delcroix, and R. Haeb-Umbach, “Segment-Less Continuous Speech Separation of Meetings: Training and Evaluation Criteria,” <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>, vol. 31, pp. 576–589, 2023, doi: <a href=\"https://doi.org/10.1109/taslp.2022.3228629\">10.1109/taslp.2022.3228629</a>.","chicago":"Neumann, Thilo von, Keisuke Kinoshita, Christoph Boeddeker, Marc Delcroix, and Reinhold Haeb-Umbach. “Segment-Less Continuous Speech Separation of Meetings: Training and Evaluation Criteria.” <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i> 31 (2023): 576–89. 
<a href=\"https://doi.org/10.1109/taslp.2022.3228629\">https://doi.org/10.1109/taslp.2022.3228629</a>.","short":"T. von Neumann, K. Kinoshita, C. Boeddeker, M. Delcroix, R. Haeb-Umbach, IEEE/ACM Transactions on Audio, Speech, and Language Processing 31 (2023) 576–589.","mla":"von Neumann, Thilo, et al. “Segment-Less Continuous Speech Separation of Meetings: Training and Evaluation Criteria.” <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>, vol. 31, Institute of Electrical and Electronics Engineers (IEEE), 2023, pp. 576–89, doi:<a href=\"https://doi.org/10.1109/taslp.2022.3228629\">10.1109/taslp.2022.3228629</a>.","bibtex":"@article{von Neumann_Kinoshita_Boeddeker_Delcroix_Haeb-Umbach_2023, title={Segment-Less Continuous Speech Separation of Meetings: Training and Evaluation Criteria}, volume={31}, DOI={<a href=\"https://doi.org/10.1109/taslp.2022.3228629\">10.1109/taslp.2022.3228629</a>}, journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={von Neumann, Thilo and Kinoshita, Keisuke and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}, year={2023}, pages={576–589} }","apa":"von Neumann, T., Kinoshita, K., Boeddeker, C., Delcroix, M., &#38; Haeb-Umbach, R. (2023). Segment-Less Continuous Speech Separation of Meetings: Training and Evaluation Criteria. <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>, <i>31</i>, 576–589. 
<a href=\"https://doi.org/10.1109/taslp.2022.3228629\">https://doi.org/10.1109/taslp.2022.3228629</a>"},"publication_identifier":{"issn":["2329-9290","2329-9304"]},"has_accepted_license":"1","publication_status":"published","keyword":["Continuous Speech Separation","Source Separation","Graph-PIT","Dynamic Programming","Permutation Invariant Training"],"ddc":["000"],"language":[{"iso":"eng"}],"abstract":[{"text":"Continuous Speech Separation (CSS) has been proposed to address speech overlaps during the analysis of realistic meeting-like conversations by eliminating any overlaps before further processing.\r\nCSS separates a recording of arbitrarily many speakers into a small number of overlap-free output channels, where each output channel may contain speech of multiple speakers.\r\nThis is often done by applying a conventional separation model trained with Utterance-level Permutation Invariant Training (uPIT), which exclusively maps a speaker to an output channel, in sliding window approach called stitching.\r\nRecently, we introduced an alternative training scheme called Graph-PIT that teaches the separation network to directly produce output streams in the required format without stitching.\r\nIt can handle an arbitrary number of speakers as long as never more of them overlap at the same time than the separator has output channels.\r\nIn this contribution, we further investigate the Graph-PIT training scheme.\r\nWe show in extended experiments that models trained with Graph-PIT also work in challenging reverberant conditions.\r\nModels trained in this way are able to perform segment-less CSS, i.e., without stitching, and achieve comparable and often better separation quality than the conventional CSS with uPIT and stitching.\r\nWe simplify the training schedule for Graph-PIT with the recently proposed Source Aggregated Signal-to-Distortion Ratio (SA-SDR) loss.\r\nIt eliminates unfavorable properties of the previously used A-SDR loss and thus enables training with 
Graph-PIT from scratch.\r\nGraph-PIT training relaxes the constraints w.r.t. the allowed numbers of speakers and speaking patterns which allows using a larger variety of training data.\r\nFurthermore, we introduce novel signal-level evaluation metrics for meeting scenarios, namely the source-aggregated scale- and convolution-invariant Signal-to-Distortion Ratio (SA-SI-SDR and SA-CI-SDR), which are generalizations of the commonly used SDR-based metrics for the CSS case.","lang":"eng"}],"file":[{"content_type":"application/pdf","relation":"main_file","date_created":"2023-01-09T17:46:05Z","creator":"haebumb","date_updated":"2023-01-11T08:50:19Z","file_id":"35607","access_level":"open_access","file_name":"main.pdf","file_size":7185077}],"publication":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","title":"Segment-Less Continuous Speech Separation of Meetings: Training and Evaluation Criteria","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","date_created":"2023-01-09T17:24:17Z","year":"2023","quality_controlled":"1"},{"status":"public","type":"conference","file_date_updated":"2023-10-19T07:19:59Z","project":[{"_id":"52","name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing"},{"grant_number":"448568305","_id":"508","name":"Automatische Transkription von Gesprächssituationen"}],"_id":"48275","user_id":"40767","department":[{"_id":"54"}],"citation":{"ama":"von Neumann T, Boeddeker C, Delcroix M, Haeb-Umbach R. MeetEval: A Toolkit for Computation of Word Error Rates for Meeting Transcription Systems. In: <i>Proc. CHiME 2023 Workshop on Speech Processing in Everyday Environments</i>. ; 2023.","ieee":"T. von Neumann, C. Boeddeker, M. Delcroix, and R. 
Haeb-Umbach, “MeetEval: A Toolkit for Computation of Word Error Rates for Meeting Transcription Systems,” presented at the CHiME 2023 Workshop on Speech Processing in Everyday Environments, Dublin, 2023.","chicago":"Neumann, Thilo von, Christoph Boeddeker, Marc Delcroix, and Reinhold Haeb-Umbach. “MeetEval: A Toolkit for Computation of Word Error Rates for Meeting Transcription Systems.” In <i>Proc. CHiME 2023 Workshop on Speech Processing in Everyday Environments</i>, 2023.","apa":"von Neumann, T., Boeddeker, C., Delcroix, M., &#38; Haeb-Umbach, R. (2023). MeetEval: A Toolkit for Computation of Word Error Rates for Meeting Transcription Systems. <i>Proc. CHiME 2023 Workshop on Speech Processing in Everyday Environments</i>. CHiME 2023 Workshop on Speech Processing in Everyday Environments, Dublin.","mla":"von Neumann, Thilo, et al. “MeetEval: A Toolkit for Computation of Word Error Rates for Meeting Transcription Systems.” <i>Proc. CHiME 2023 Workshop on Speech Processing in Everyday Environments</i>, 2023.","bibtex":"@inproceedings{von Neumann_Boeddeker_Delcroix_Haeb-Umbach_2023, title={MeetEval: A Toolkit for Computation of Word Error Rates for Meeting Transcription Systems}, booktitle={Proc. CHiME 2023 Workshop on Speech Processing in Everyday Environments}, author={von Neumann, Thilo and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}, year={2023} }","short":"T. von Neumann, C. Boeddeker, M. Delcroix, R. Haeb-Umbach, in: Proc. 
CHiME 2023 Workshop on Speech Processing in Everyday Environments, 2023."},"has_accepted_license":"1","related_material":{"link":[{"relation":"software","url":"https://github.com/fgnt/meeteval"}]},"main_file_link":[{"url":"https://arxiv.org/abs/2307.11394","open_access":"1"}],"conference":{"name":"CHiME 2023 Workshop on Speech Processing in Everyday Environments","location":"Dublin"},"oa":"1","date_updated":"2025-02-12T09:12:05Z","author":[{"full_name":"von Neumann, Thilo","id":"49870","orcid":"https://orcid.org/0000-0002-7717-8670","last_name":"von Neumann","first_name":"Thilo"},{"last_name":"Boeddeker","full_name":"Boeddeker, Christoph","id":"40767","first_name":"Christoph"},{"first_name":"Marc","last_name":"Delcroix","full_name":"Delcroix, Marc"},{"last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242","first_name":"Reinhold"}],"abstract":[{"lang":"eng","text":"MeetEval is an open-source toolkit to evaluate  all kinds of meeting transcription systems.\r\nIt provides a unified interface for the computation of commonly used Word Error Rates (WERs), specifically cpWER, ORC WER and MIMO WER along other WER definitions.\r\nWe extend the cpWER computation by a temporal constraint to ensure that only words are identified as correct when the temporal alignment is plausible.\r\nThis leads to a better quality of the matching of the hypothesis string to the reference string that more closely resembles the actual transcription quality, and a system is penalized if it provides poor time annotations.\r\nSince word-level timing information is often not available, we present a way to approximate exact word-level timings from segment-level timings (e.g., a sentence) and show that the approximation leads to a similar WER as a matching with exact word-level annotations.\r\nAt the same time, the time constraint leads to a speedup of the matching algorithm, which outweighs the additional overhead caused by processing the time 
stamps."}],"file":[{"file_id":"48276","access_level":"open_access","file_name":"Chime_7__MeetEval.pdf","file_size":263744,"creator":"tvn","date_created":"2023-10-19T07:19:59Z","date_updated":"2023-10-19T07:19:59Z","relation":"main_file","content_type":"application/pdf"}],"publication":"Proc. CHiME 2023 Workshop on Speech Processing in Everyday Environments","ddc":["000"],"keyword":["Speech Recognition","Word Error Rate","Meeting Transcription"],"language":[{"iso":"eng"}],"year":"2023","quality_controlled":"1","title":"MeetEval: A Toolkit for Computation of Word Error Rates for Meeting Transcription Systems","date_created":"2023-10-19T07:24:51Z"},{"quality_controlled":"1","year":"2021","date_created":"2021-10-25T08:50:01Z","title":"Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers","publication":"Interspeech 2021","abstract":[{"lang":"eng","text":"Automatic transcription of meetings requires handling of overlapped speech, which calls for continuous speech separation (CSS) systems. The uPIT criterion was proposed for utterance-level separation with neural networks and introduces the constraint that the total number of speakers must not exceed the number of output channels. When processing meeting-like data in a segment-wise manner, i.e., by separating overlapping segments independently and stitching adjacent segments to continuous output streams, this constraint has to be fulfilled for any segment. In this contribution, we show that this constraint can be significantly relaxed. We propose a novel graph-based PIT criterion, which casts the assignment of utterances to output channels in a graph coloring problem. It only requires that the number of concurrently active speakers must not exceed the number of output channels. 
As a consequence, the system can process an arbitrary number of speakers and arbitrarily long segments and thus can handle more diverse scenarios.\r\nFurther, the stitching algorithm for obtaining a consistent output order in neighboring segments is of less importance and can even be eliminated completely, not the least reducing the computational effort. Experiments on meeting-style WSJ data show improvements in recognition performance over using the uPIT criterion. "}],"file":[{"relation":"supplementary_material","content_type":"video/mp4","file_size":9550220,"title":"Video for INTERSPEECH 2021","file_id":"28327","access_level":"open_access","file_name":"Interspeech 2021 voiceover-002-compressed.mp4","date_updated":"2021-12-06T10:48:30Z","date_created":"2021-12-06T10:39:13Z","creator":"tvn"},{"content_type":"application/vnd.openxmlformats-officedocument.presentationml.presentation","relation":"slides","date_updated":"2021-12-06T10:47:01Z","creator":"tvn","date_created":"2021-12-06T10:47:01Z","title":"Slides from INTERSPEECH 2021","file_size":1337297,"file_id":"28328","access_level":"open_access","file_name":"Graph-PIT-poster-presentation.pptx"},{"relation":"main_file","content_type":"application/pdf","file_size":226589,"file_id":"28329","file_name":"INTERSPEECH2021_Graph_PIT.pdf","access_level":"open_access","date_updated":"2021-12-06T10:48:21Z","date_created":"2021-12-06T10:48:21Z","creator":"tvn"}],"keyword":["Continuous speech separation","automatic speech recognition","overlapped speech","permutation invariant training"],"ddc":["000"],"language":[{"iso":"eng"}],"has_accepted_license":"1","publication_status":"published","related_material":{"link":[{"url":"https://github.com/fgnt/graph_pit","relation":"software"}]},"citation":{"apa":"von Neumann, T., Kinoshita, K., Boeddeker, C., Delcroix, M., &#38; Haeb-Umbach, R. (2021). Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers. <i>Interspeech 2021</i>. 
Interspeech. <a href=\"https://doi.org/10.21437/interspeech.2021-1177\">https://doi.org/10.21437/interspeech.2021-1177</a>","short":"T. von Neumann, K. Kinoshita, C. Boeddeker, M. Delcroix, R. Haeb-Umbach, in: Interspeech 2021, 2021.","bibtex":"@inproceedings{von Neumann_Kinoshita_Boeddeker_Delcroix_Haeb-Umbach_2021, title={Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers}, DOI={<a href=\"https://doi.org/10.21437/interspeech.2021-1177\">10.21437/interspeech.2021-1177</a>}, booktitle={Interspeech 2021}, author={von Neumann, Thilo and Kinoshita, Keisuke and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}, year={2021} }","mla":"von Neumann, Thilo, et al. “Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers.” <i>Interspeech 2021</i>, 2021, doi:<a href=\"https://doi.org/10.21437/interspeech.2021-1177\">10.21437/interspeech.2021-1177</a>.","ama":"von Neumann T, Kinoshita K, Boeddeker C, Delcroix M, Haeb-Umbach R. Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers. In: <i>Interspeech 2021</i>. ; 2021. doi:<a href=\"https://doi.org/10.21437/interspeech.2021-1177\">10.21437/interspeech.2021-1177</a>","chicago":"Neumann, Thilo von, Keisuke Kinoshita, Christoph Boeddeker, Marc Delcroix, and Reinhold Haeb-Umbach. “Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers.” In <i>Interspeech 2021</i>, 2021. <a href=\"https://doi.org/10.21437/interspeech.2021-1177\">https://doi.org/10.21437/interspeech.2021-1177</a>.","ieee":"T. von Neumann, K. Kinoshita, C. Boeddeker, M. Delcroix, and R. 
Haeb-Umbach, “Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers,” presented at the Interspeech, 2021, doi: <a href=\"https://doi.org/10.21437/interspeech.2021-1177\">10.21437/interspeech.2021-1177</a>."},"oa":"1","date_updated":"2023-11-15T12:14:40Z","author":[{"full_name":"von Neumann, Thilo","id":"49870","orcid":"https://orcid.org/0000-0002-7717-8670","last_name":"von Neumann","first_name":"Thilo"},{"first_name":"Keisuke","last_name":"Kinoshita","full_name":"Kinoshita, Keisuke"},{"full_name":"Boeddeker, Christoph","id":"40767","last_name":"Boeddeker","first_name":"Christoph"},{"last_name":"Delcroix","full_name":"Delcroix, Marc","first_name":"Marc"},{"full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach","first_name":"Reinhold"}],"conference":{"name":"Interspeech"},"doi":"10.21437/interspeech.2021-1177","type":"conference","status":"public","_id":"26770","project":[{"name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"department":[{"_id":"54"}],"user_id":"49870","file_date_updated":"2021-12-06T10:48:30Z"},{"status":"public","abstract":[{"lang":"eng","text":"Repetitive TMS (rTMS) with a frequency of 5-10~Hz is widely used for language mapping. However, it may be accompanied by discomfort and is limited in the number and reliability of evoked language errors. We, here, systematically tested the influence of different stimulation frequencies (i.e., 10, 30, and 50 Hz) on tolerability, number, reliability, and cortical distribution of language errors aiming at improved language mapping. 15 right-handed, healthy subjects (m~=~8, median age: 29 yrs) were investigated in two sessions, separated by 2-5 days. In each session, 10, 30, and 50 Hz rTMS were applied over the left hemisphere in a randomized order during a picture naming task. 
Overall, 30 Hz rTMS evoked significantly more errors (20 $\\pm$ 12{%}) compared to 50 Hz (12 $\\pm$ 8{%}; p {\\textless}.01), whereas error rates were comparable between 30/50 and 10~Hz (18 $\\pm$ 11{%}). Across all conditions, a significantly higher error rate was found in Session 1 (19 $\\pm$ 13{%}) compared to Session 2 (13 $\\pm$ 7{%}, p {\\textless}.05). The error rate was poorly reliable between sessions for 10 (intraclass correlation coefficient, ICC~=~.315) and 30 Hz (ICC~=~.427), whereas 50 Hz showed a moderate reliability (ICC~=~.597). Spatial reliability of language errors was low to moderate with a tendency toward increased reliability for higher frequencies, for example, within frontal regions. Compared to 10~Hz, both, 30 and 50 Hz were rated as less painful. Taken together, our data favor the use of rTMS-protocols employing higher frequencies for evoking language errors reliably and with reduced discomfort, depending on the region of interest."}],"type":"journal_article","publication":"Human brain mapping","language":[{"iso":"eng"}],"extern":"1","keyword":["Adult","Brain Mapping","Cerebral Cortex/diagnostic imaging/physiology","Female","Humans","Magnetic Resonance Imaging","Male","Pattern Recognition","Psycholinguistics","Reproducibility of Results","Speech/physiology","Transcranial Magnetic Stimulation","Visual/physiology","Young Adult"],"user_id":"61071","_id":"57971","citation":{"ama":"Nettekoven C, Pieczewski J, Neuschmelting V, et al. Improving the efficacy and reliability of rTMS language mapping by increasing the stimulation frequency. <i>Human brain mapping</i>. 2021;42(16):5309–5321. doi:<a href=\"https://doi.org/10.1002/hbm.25619\">10.1002/hbm.25619</a>","chicago":"Nettekoven, Charlotte, Julia Pieczewski, Volker Neuschmelting, Kristina Jonas, Roland Goldbrunner, Christian Grefkes, and Carolin Weiss Lucas. 
“Improving the Efficacy and Reliability of RTMS Language Mapping by Increasing the Stimulation Frequency.” <i>Human Brain Mapping</i> 42, no. 16 (2021): 5309–5321. <a href=\"https://doi.org/10.1002/hbm.25619\">https://doi.org/10.1002/hbm.25619</a>.","ieee":"C. Nettekoven <i>et al.</i>, “Improving the efficacy and reliability of rTMS language mapping by increasing the stimulation frequency,” <i>Human brain mapping</i>, vol. 42, no. 16, pp. 5309–5321, 2021, doi: <a href=\"https://doi.org/10.1002/hbm.25619\">10.1002/hbm.25619</a>.","bibtex":"@article{Nettekoven_Pieczewski_Neuschmelting_Jonas_Goldbrunner_Grefkes_Weiss Lucas_2021, title={Improving the efficacy and reliability of rTMS language mapping by increasing the stimulation frequency}, volume={42}, DOI={<a href=\"https://doi.org/10.1002/hbm.25619\">10.1002/hbm.25619</a>}, number={16}, journal={Human brain mapping}, author={Nettekoven, Charlotte and Pieczewski, Julia and Neuschmelting, Volker and Jonas, Kristina and Goldbrunner, Roland and Grefkes, Christian and Weiss Lucas, Carolin}, year={2021}, pages={5309–5321} }","mla":"Nettekoven, Charlotte, et al. “Improving the Efficacy and Reliability of RTMS Language Mapping by Increasing the Stimulation Frequency.” <i>Human Brain Mapping</i>, vol. 42, no. 16, 2021, pp. 5309–5321, doi:<a href=\"https://doi.org/10.1002/hbm.25619\">10.1002/hbm.25619</a>.","short":"C. Nettekoven, J. Pieczewski, V. Neuschmelting, K. Jonas, R. Goldbrunner, C. Grefkes, C. Weiss Lucas, Human Brain Mapping 42 (2021) 5309–5321.","apa":"Nettekoven, C., Pieczewski, J., Neuschmelting, V., Jonas, K., Goldbrunner, R., Grefkes, C., &#38; Weiss Lucas, C. (2021). Improving the efficacy and reliability of rTMS language mapping by increasing the stimulation frequency. <i>Human Brain Mapping</i>, <i>42</i>(16), 5309–5321. 
<a href=\"https://doi.org/10.1002/hbm.25619\">https://doi.org/10.1002/hbm.25619</a>"},"page":"5309–5321","intvolume":"        42","year":"2021","issue":"16","doi":"10.1002/hbm.25619","title":"Improving the efficacy and reliability of rTMS language mapping by increasing the stimulation frequency","date_created":"2025-01-06T12:11:43Z","author":[{"first_name":"Charlotte","last_name":"Nettekoven","full_name":"Nettekoven, Charlotte"},{"full_name":"Pieczewski, Julia","last_name":"Pieczewski","first_name":"Julia"},{"full_name":"Neuschmelting, Volker","last_name":"Neuschmelting","first_name":"Volker"},{"full_name":"Jonas, Kristina","id":"94540","last_name":"Jonas","orcid":"0000-0002-1067-9139","first_name":"Kristina"},{"first_name":"Roland","full_name":"Goldbrunner, Roland","last_name":"Goldbrunner"},{"full_name":"Grefkes, Christian","last_name":"Grefkes","first_name":"Christian"},{"first_name":"Carolin","last_name":"Weiss Lucas","full_name":"Weiss Lucas, Carolin"}],"volume":42,"date_updated":"2026-04-13T11:37:55Z"},{"author":[{"first_name":"Jens","full_name":"Heitkaemper, Jens","id":"27643","last_name":"Heitkaemper"},{"first_name":"Darius","full_name":"Jakobeit, Darius","last_name":"Jakobeit"},{"full_name":"Boeddeker, Christoph","id":"40767","last_name":"Boeddeker","first_name":"Christoph"},{"full_name":"Drude, Lukas","last_name":"Drude","first_name":"Lukas"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold"}],"date_updated":"2022-01-13T08:47:32Z","citation":{"mla":"Heitkaemper, Jens, et al. “Demystifying TasNet: A Dissecting Approach.” <i>ICASSP 2020 Virtual Barcelona Spain</i>, 2020.","bibtex":"@inproceedings{Heitkaemper_Jakobeit_Boeddeker_Drude_Haeb-Umbach_2020, title={Demystifying TasNet: A Dissecting Approach}, booktitle={ICASSP 2020 Virtual Barcelona Spain}, author={Heitkaemper, Jens and Jakobeit, Darius and Boeddeker, Christoph and Drude, Lukas and Haeb-Umbach, Reinhold}, year={2020} }","short":"J. Heitkaemper, D. 
Jakobeit, C. Boeddeker, L. Drude, R. Haeb-Umbach, in: ICASSP 2020 Virtual Barcelona Spain, 2020.","apa":"Heitkaemper, J., Jakobeit, D., Boeddeker, C., Drude, L., &#38; Haeb-Umbach, R. (2020). Demystifying TasNet: A Dissecting Approach. <i>ICASSP 2020 Virtual Barcelona Spain</i>.","ieee":"J. Heitkaemper, D. Jakobeit, C. Boeddeker, L. Drude, and R. Haeb-Umbach, “Demystifying TasNet: A Dissecting Approach,” 2020.","chicago":"Heitkaemper, Jens, Darius Jakobeit, Christoph Boeddeker, Lukas Drude, and Reinhold Haeb-Umbach. “Demystifying TasNet: A Dissecting Approach.” In <i>ICASSP 2020 Virtual Barcelona Spain</i>, 2020.","ama":"Heitkaemper J, Jakobeit D, Boeddeker C, Drude L, Haeb-Umbach R. Demystifying TasNet: A Dissecting Approach. In: <i>ICASSP 2020 Virtual Barcelona Spain</i>. ; 2020."},"has_accepted_license":"1","file_date_updated":"2020-12-11T12:36:37Z","user_id":"40767","department":[{"_id":"54"}],"project":[{"name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"_id":"20504","status":"public","type":"conference","title":"Demystifying TasNet: A Dissecting Approach","date_created":"2020-11-25T14:56:53Z","year":"2020","quality_controlled":"1","language":[{"iso":"eng"}],"ddc":["000"],"keyword":["voice activity detection","speech activity detection","neural network","statistical speech processing"],"file":[{"file_id":"20699","file_name":"ms.pdf","access_level":"closed","file_size":3871374,"creator":"jensheit","date_created":"2020-12-11T12:36:37Z","date_updated":"2020-12-11T12:36:37Z","relation":"main_file","success":1,"content_type":"application/pdf"}],"license":"https://creativecommons.org/publicdomain/zero/1.0/","abstract":[{"lang":"eng","text":"In recent years time domain speech separation has excelled over frequency domain separation in single channel scenarios and noise-free environments. 
In this paper we dissect the gains of the time-domain audio separation network (TasNet) approach by gradually replacing components of an utterance-level permutation invariant training (u-PIT) based separation system in the frequency domain until the TasNet system is reached, thus blending components of frequency domain approaches with those of time domain approaches. Some of the intermediate variants achieve comparable signal-to-distortion ratio (SDR) gains to TasNet, but retain the advantage of frequency domain processing: compatibility with classic signal processing tools such as frequency-domain beamforming and the human interpretability of the masks. Furthermore, we show that the scale invariant signal-to-distortion ratio (si-SDR) criterion used as loss function in TasNet is related to a logarithmic mean square error criterion and that it is this criterion which contributes most reliable to the performance advantage of TasNet. Finally, we critically assess which gains in a noise-free single channel environment generalize to more realistic reverberant conditions."}],"publication":"ICASSP 2020 Virtual Barcelona Spain"},{"type":"conference","publication":"INTERSPEECH 2020 Virtual Shanghai China","file":[{"success":1,"relation":"main_file","content_type":"application/pdf","file_size":998706,"file_id":"20697","file_name":"ms.pdf","access_level":"closed","date_updated":"2020-12-11T12:33:04Z","date_created":"2020-12-11T12:33:04Z","creator":"jensheit"}],"status":"public","abstract":[{"lang":"eng","text":"Speech activity detection (SAD), which often rests on the fact that the noise is \"more'' stationary than speech, is particularly challenging in non-stationary environments, because the time variance of the acoustic scene makes it difficult to discriminate  speech from noise. We propose two approaches to SAD, where one is based on statistical signal processing, while the other utilizes neural networks. 
The former employs sophisticated signal processing to track the noise and speech energies and is meant to support the case for a resource efficient, unsupervised signal processing approach.\r\nThe latter introduces a recurrent network layer that operates on short segments of the input speech to do temporal smoothing in the presence of non-stationary noise. The systems are tested on the Fearless Steps challenge database, which consists of the transmission data from the Apollo-11 space mission.\r\nThe statistical SAD  achieves comparable detection performance to earlier proposed neural network based SADs, while the neural network based approach leads to a decision cost function of 1.07% on the evaluation set of the 2020 Fearless Steps Challenge, which sets a new state of the art."}],"user_id":"460","department":[{"_id":"54"}],"project":[{"name":"Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"_id":"20505","file_date_updated":"2020-12-11T12:33:04Z","language":[{"iso":"eng"}],"ddc":["000"],"keyword":["voice activity detection","speech activity detection","neural network","statistical speech processing"],"has_accepted_license":"1","citation":{"ama":"Heitkaemper J, Schmalenstroeer J, Haeb-Umbach R. Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments. In: <i>INTERSPEECH 2020 Virtual Shanghai China</i>. ; 2020.","chicago":"Heitkaemper, Jens, Joerg Schmalenstroeer, and Reinhold Haeb-Umbach. “Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments.” In <i>INTERSPEECH 2020 Virtual Shanghai China</i>, 2020.","ieee":"J. Heitkaemper, J. Schmalenstroeer, and R. Haeb-Umbach, “Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments,” 2020.","short":"J. Heitkaemper, J. Schmalenstroeer, R. 
Haeb-Umbach, in: INTERSPEECH 2020 Virtual Shanghai China, 2020.","bibtex":"@inproceedings{Heitkaemper_Schmalenstroeer_Haeb-Umbach_2020, title={Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments}, booktitle={INTERSPEECH 2020 Virtual Shanghai China}, author={Heitkaemper, Jens and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold}, year={2020} }","mla":"Heitkaemper, Jens, et al. “Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments.” <i>INTERSPEECH 2020 Virtual Shanghai China</i>, 2020.","apa":"Heitkaemper, J., Schmalenstroeer, J., &#38; Haeb-Umbach, R. (2020). Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments. <i>INTERSPEECH 2020 Virtual Shanghai China</i>."},"year":"2020","author":[{"first_name":"Jens","full_name":"Heitkaemper, Jens","id":"27643","last_name":"Heitkaemper"},{"last_name":"Schmalenstroeer","id":"460","full_name":"Schmalenstroeer, Joerg","first_name":"Joerg"},{"last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold"}],"date_created":"2020-11-25T15:03:19Z","date_updated":"2023-10-26T08:28:49Z","title":"Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments"},{"citation":{"bibtex":"@inproceedings{Abramov_Kopp_Nemeth_Kern_Mertens_Rohlfing_2018, title={Towards a Computational Model of Child Gesture-Speech Production}, booktitle={KOGWIS2018: Computational Approaches to Cognitive Science}, author={Abramov, Olga and Kopp, Stefan and Nemeth, Anne and Kern, Friederike and Mertens, Ulrich and Rohlfing, Katharina}, year={2018} }","short":"O. Abramov, S. Kopp, A. Nemeth, F. Kern, U. Mertens, K. Rohlfing, in: KOGWIS2018: Computational Approaches to Cognitive Science, 2018.","mla":"Abramov, Olga, et al. 
“Towards a Computational Model of Child Gesture-Speech Production.” <i>KOGWIS2018: Computational Approaches to Cognitive Science</i>, 2018.","apa":"Abramov, O., Kopp, S., Nemeth, A., Kern, F., Mertens, U., &#38; Rohlfing, K. (2018). Towards a Computational Model of Child Gesture-Speech Production. <i>KOGWIS2018: Computational Approaches to Cognitive Science</i>.","ama":"Abramov O, Kopp S, Nemeth A, Kern F, Mertens U, Rohlfing K. Towards a Computational Model of Child Gesture-Speech Production. In: <i>KOGWIS2018: Computational Approaches to Cognitive Science</i>. ; 2018.","ieee":"O. Abramov, S. Kopp, A. Nemeth, F. Kern, U. Mertens, and K. Rohlfing, “Towards a Computational Model of Child Gesture-Speech Production,” 2018.","chicago":"Abramov, Olga, Stefan Kopp, Anne Nemeth, Friederike Kern, Ulrich Mertens, and Katharina Rohlfing. “Towards a Computational Model of Child Gesture-Speech Production.” In <i>KOGWIS2018: Computational Approaches to Cognitive Science</i>, 2018."},"year":"2018","date_created":"2020-08-03T11:00:54Z","author":[{"first_name":"Olga","full_name":"Abramov, Olga","last_name":"Abramov"},{"first_name":"Stefan","last_name":"Kopp","full_name":"Kopp, Stefan"},{"first_name":"Anne","last_name":"Nemeth","full_name":"Nemeth, Anne"},{"first_name":"Friederike","last_name":"Kern","full_name":"Kern, Friederike"},{"last_name":"Mertens","full_name":"Mertens, Ulrich","first_name":"Ulrich"},{"last_name":"Rohlfing","id":"50352","full_name":"Rohlfing, Katharina","first_name":"Katharina"}],"date_updated":"2023-02-01T12:50:21Z","title":"Towards a Computational Model of Child Gesture-Speech Production","publication":"KOGWIS2018: Computational Approaches to Cognitive Science","type":"conference","status":"public","abstract":[{"lang":"eng","text":"Previous work by [1] studied gesture-speech interaction in adults. 
[1] focussed on temporal and semantic coordination of gesture and speech and found that while adult speech is mostly coordinated (or redundant) with gestures, semantic coordination increases the temporal synchrony. These observations do not necessarily hold for children (in particular with respect to iconic gestures, see [2]), where the speech and gesture systems are still under development. We studied the semantic and temporal coordination of speech and gesture in 4-year old children using a corpus of 40 children producing action descriptions in task oriented dialogues. In particular, we examined what kinds of information are transmitted verbally vs. non-verbally and how they are related. To account for this, we extended the semantic features (SFs) developed in [3] for object descriptions in order to include the semantics of actions. We coded the SFs on the children’s speech and gestures separately using video data. In our presentation, we will focus on the quantitative distribution of SFs across gesture and speech. Our results indicate that speech and gestures of 4-year olds are less integrated than those of the adults, although there is a large variability among the children. We will discuss the results with respect to the cognitive processes (e.g., visual memory, language) underlying children’s abilities at this stage of development. Our work paves the way for the cognitive architecture of speech-gesture interaction in preschoolers which to our knowledge is missing so far. "}],"department":[{"_id":"749"}],"user_id":"14931","_id":"17557","language":[{"iso":"eng"}],"keyword":["Speech-gesture integration","semantic features"]},{"abstract":[{"lang":"eng","text":"Previous work by [1] studied gesture-speech interaction in adults. [1] focussed on temporal and semantic coordination of gesture and speech and found that while adult speech is mostly coordinated (or redundant) with gestures, semantic coordination increases the temporal synchrony. 
These observations do not necessarily hold for children (in particular with respect to iconic gestures, see [2]), where the speech and gesture systems are still under development. We studied the semantic and temporal coordination of speech and gesture in 4-year old children using a corpus of 40 children producing action descriptions in task oriented dialogues. In particular, we examined what kinds of information are transmitted verbally vs. non-verbally and how they are related. To account for this, we extended the semantic features (SFs) developed in [3] for object descriptions in order to include the semantics of actions. We coded the SFs on the children’s speech and gestures separately using video data. In our presentation, we will focus on the quantitative distribution of SFs across gesture and speech. Our results indicate that speech and gestures of 4-year olds are less integrated than those of the adults, although there is a large variability among the children. We will discuss the results with respect to the cognitive processes (e.g., visual memory, language) underlying children’s abilities at this stage of development. Our work paves the way for the cognitive architecture of speech-gesture interaction in preschoolers which to our knowledge is missing so far. "}],"status":"public","type":"conference","publication":"KOGWIS2018: Computational Approaches to Cognitive Science","keyword":["Speech-gesture integration","semantic features"],"language":[{"iso":"eng"}],"_id":"17179","user_id":"14931","department":[{"_id":"749"}],"year":"2018","citation":{"apa":"Abramov, O., Kopp, S., Nemeth, A., Kern, F., Mertens, U., &#38; Rohlfing, K. (2018). Towards a Computational Model of Child Gesture-Speech Production. <i>KOGWIS2018: Computational Approaches to Cognitive Science</i>.","short":"O. Abramov, S. Kopp, A. Nemeth, F. Kern, U. Mertens, K. 
Rohlfing, in: KOGWIS2018: Computational Approaches to Cognitive Science, 2018.","bibtex":"@inproceedings{Abramov_Kopp_Nemeth_Kern_Mertens_Rohlfing_2018, title={Towards a Computational Model of Child Gesture-Speech Production}, booktitle={KOGWIS2018: Computational Approaches to Cognitive Science}, author={Abramov, Olga and Kopp, Stefan and Nemeth, Anne and Kern, Friederike and Mertens, Ulrich and Rohlfing, Katharina}, year={2018} }","mla":"Abramov, Olga, et al. “Towards a Computational Model of Child Gesture-Speech Production.” <i>KOGWIS2018: Computational Approaches to Cognitive Science</i>, 2018.","ama":"Abramov O, Kopp S, Nemeth A, Kern F, Mertens U, Rohlfing K. Towards a Computational Model of Child Gesture-Speech Production. In: <i>KOGWIS2018: Computational Approaches to Cognitive Science</i>. ; 2018.","chicago":"Abramov, Olga, Stefan Kopp, Anne Nemeth, Friederike Kern, Ulrich Mertens, and Katharina Rohlfing. “Towards a Computational Model of Child Gesture-Speech Production.” In <i>KOGWIS2018: Computational Approaches to Cognitive Science</i>, 2018.","ieee":"O. Abramov, S. Kopp, A. Nemeth, F. Kern, U. Mertens, and K. 
Rohlfing, “Towards a Computational Model of Child Gesture-Speech Production,” 2018."},"title":"Towards a Computational Model of Child Gesture-Speech Production","date_updated":"2023-02-01T16:24:45Z","date_created":"2020-06-24T13:00:54Z","author":[{"last_name":"Abramov","full_name":"Abramov, Olga","first_name":"Olga"},{"last_name":"Kopp","full_name":"Kopp, Stefan","first_name":"Stefan"},{"first_name":"Anne","last_name":"Nemeth","full_name":"Nemeth, Anne"},{"full_name":"Kern, Friederike","last_name":"Kern","first_name":"Friederike"},{"last_name":"Mertens","full_name":"Mertens, Ulrich","first_name":"Ulrich"},{"full_name":"Rohlfing, Katharina","id":"50352","last_name":"Rohlfing","first_name":"Katharina"}]},{"main_file_link":[{"open_access":"1","url":"https://groups.uni-paderborn.de/nt/pubs/2015/ChHa15.pdf"}],"title":"On Optimal Smoothing in Minimum Statistics Based Noise Tracking","author":[{"first_name":"Aleksej","last_name":"Chinaev","full_name":"Chinaev, Aleksej"},{"last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242","first_name":"Reinhold"}],"date_created":"2019-07-12T05:27:19Z","date_updated":"2022-01-06T06:51:08Z","oa":"1","page":"1785-1789","citation":{"ama":"Chinaev A, Haeb-Umbach R. On Optimal Smoothing in Minimum Statistics Based Noise Tracking. In: <i>Interspeech 2015</i>. ; 2015:1785-1789.","ieee":"A. Chinaev and R. Haeb-Umbach, “On Optimal Smoothing in Minimum Statistics Based Noise Tracking,” in <i>Interspeech 2015</i>, 2015, pp. 1785–1789.","chicago":"Chinaev, Aleksej, and Reinhold Haeb-Umbach. “On Optimal Smoothing in Minimum Statistics Based Noise Tracking.” In <i>Interspeech 2015</i>, 1785–89, 2015.","apa":"Chinaev, A., &#38; Haeb-Umbach, R. (2015). On Optimal Smoothing in Minimum Statistics Based Noise Tracking. In <i>Interspeech 2015</i> (pp. 1785–1789).","mla":"Chinaev, Aleksej, and Reinhold Haeb-Umbach. “On Optimal Smoothing in Minimum Statistics Based Noise Tracking.” <i>Interspeech 2015</i>, 2015, pp. 
1785–89.","bibtex":"@inproceedings{Chinaev_Haeb-Umbach_2015, title={On Optimal Smoothing in Minimum Statistics Based Noise Tracking}, booktitle={Interspeech 2015}, author={Chinaev, Aleksej and Haeb-Umbach, Reinhold}, year={2015}, pages={1785–1789} }","short":"A. Chinaev, R. Haeb-Umbach, in: Interspeech 2015, 2015, pp. 1785–1789."},"year":"2015","related_material":{"link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2015/ChHa15_Poster.pdf","description":"Poster","relation":"supplementary_material"}]},"language":[{"iso":"eng"}],"keyword":["speech enhancement","noise tracking","optimal smoothing"],"department":[{"_id":"54"}],"user_id":"44006","_id":"11739","status":"public","abstract":[{"text":"Noise tracking is an important component of speech enhancement algorithms. Of the many noise trackers proposed, Minimum Statistics (MS) is a particularly popular one due to its simple parameterization and at the same time excellent performance. In this paper we propose to further reduce the number of MS parameters by giving an alternative derivation of an optimal smoothing constant. At the same time the noise tracking performance is improved as is demonstrated by experiments employing speech degraded by various noise types and at different SNR values.","lang":"eng"}],"publication":"Interspeech 2015","type":"conference"},{"publication":"Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on","type":"conference","abstract":[{"text":"The parametric Bayesian Feature Enhancement (BFE) and a datadriven Denoising Autoencoder (DA) both bring performance gains in severe single-channel speech recognition conditions. The first can be adjusted to different conditions by an appropriate parameter setting, while the latter needs to be trained on conditions similar to the ones expected at decoding time, making it vulnerable to a mismatch between training and test conditions. 
We use a DNN backend and study reverberant ASR under three types of mismatch conditions: different room reverberation times, different speaker to microphone distances and the difference between artificially reverberated data and the recordings in a reverberant environment. We show that for these mismatch conditions BFE can provide the targets for a DA. This unsupervised adaptation provides a performance gain over the direct use of BFE and even enables to compensate for the mismatch of real and simulated reverberant data.","lang":"eng"}],"status":"public","_id":"11813","department":[{"_id":"54"}],"user_id":"44006","keyword":["codecs","signal denoising","speech recognition","Bayesian feature enhancement","denoising autoencoder","reverberant ASR","single-channel speech recognition","speaker to microphone distances","unsupervised adaptation","Adaptation models","Noise reduction","Reverberation","Speech","Speech recognition","Training","deep neuronal networks","denoising autoencoder","feature enhancement","robust speech recognition"],"language":[{"iso":"eng"}],"year":"2015","page":"5053-5057","citation":{"bibtex":"@inproceedings{Heymann_Haeb-Umbach_Golik_Schlueter_2015, title={Unsupervised adaptation of a denoising autoencoder by Bayesian Feature Enhancement for reverberant asr under mismatch conditions}, DOI={<a href=\"https://doi.org/10.1109/ICASSP.2015.7178933\">10.1109/ICASSP.2015.7178933</a>}, booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on}, author={Heymann, Jahn and Haeb-Umbach, Reinhold and Golik, P. and Schlueter, R.}, year={2015}, pages={5053–5057} }","short":"J. Heymann, R. Haeb-Umbach, P. Golik, R. Schlueter, in: Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference On, 2015, pp. 5053–5057.","mla":"Heymann, Jahn, et al. 
“Unsupervised Adaptation of a Denoising Autoencoder by Bayesian Feature Enhancement for Reverberant Asr under Mismatch Conditions.” <i>Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference On</i>, 2015, pp. 5053–57, doi:<a href=\"https://doi.org/10.1109/ICASSP.2015.7178933\">10.1109/ICASSP.2015.7178933</a>.","apa":"Heymann, J., Haeb-Umbach, R., Golik, P., &#38; Schlueter, R. (2015). Unsupervised adaptation of a denoising autoencoder by Bayesian Feature Enhancement for reverberant asr under mismatch conditions. In <i>Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on</i> (pp. 5053–5057). <a href=\"https://doi.org/10.1109/ICASSP.2015.7178933\">https://doi.org/10.1109/ICASSP.2015.7178933</a>","chicago":"Heymann, Jahn, Reinhold Haeb-Umbach, P. Golik, and R. Schlueter. “Unsupervised Adaptation of a Denoising Autoencoder by Bayesian Feature Enhancement for Reverberant Asr under Mismatch Conditions.” In <i>Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference On</i>, 5053–57, 2015. <a href=\"https://doi.org/10.1109/ICASSP.2015.7178933\">https://doi.org/10.1109/ICASSP.2015.7178933</a>.","ieee":"J. Heymann, R. Haeb-Umbach, P. Golik, and R. Schlueter, “Unsupervised adaptation of a denoising autoencoder by Bayesian Feature Enhancement for reverberant asr under mismatch conditions,” in <i>Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on</i>, 2015, pp. 5053–5057.","ama":"Heymann J, Haeb-Umbach R, Golik P, Schlueter R. Unsupervised adaptation of a denoising autoencoder by Bayesian Feature Enhancement for reverberant asr under mismatch conditions. In: <i>Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference On</i>. ; 2015:5053-5057. 
doi:<a href=\"https://doi.org/10.1109/ICASSP.2015.7178933\">10.1109/ICASSP.2015.7178933</a>"},"date_updated":"2022-01-06T06:51:09Z","oa":"1","author":[{"last_name":"Heymann","id":"9168","full_name":"Heymann, Jahn","first_name":"Jahn"},{"id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach","first_name":"Reinhold"},{"first_name":"P.","last_name":"Golik","full_name":"Golik, P."},{"last_name":"Schlueter","full_name":"Schlueter, R.","first_name":"R."}],"date_created":"2019-07-12T05:28:45Z","title":"Unsupervised adaptation of a denoising autoencoder by Bayesian Feature Enhancement for reverberant asr under mismatch conditions","doi":"10.1109/ICASSP.2015.7178933","main_file_link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2015/hey_icassp_2015.pdf","open_access":"1"}]},{"_id":"57964","department":[{"_id":"890"}],"user_id":"61071","keyword":["610 Medical sciences","Medicine","reliability","speech mapping","TMS"],"language":[{"iso":"eng"}],"extern":"1","type":"conference_abstract","status":"public","date_updated":"2026-04-20T11:02:37Z","author":[{"first_name":"Julia","last_name":"Pieczewski","full_name":"Pieczewski, Julia"},{"full_name":"Neuschmelting, Volker","last_name":"Neuschmelting","first_name":"Volker"},{"first_name":"Kristina","full_name":"Thiele, Kristina","id":"94540","last_name":"Thiele","orcid":"0000-0002-1067-9139"},{"full_name":"Grefkes, Christian","last_name":"Grefkes","first_name":"Christian"},{"first_name":"Roland","full_name":"Goldbrunner, Roland","last_name":"Goldbrunner"},{"last_name":"Weiss Lucas","full_name":"Weiss Lucas, Carolin ","first_name":"Carolin "}],"date_created":"2025-01-06T12:11:42Z","title":"Good retest reliability of the rate of speech errors evoked by 10 Hz navigated repetitive transcranial magnetic stimulation in healthy volunteers: German Medical Science GMS Publishing House","doi":"10.3205/15dgnc394","year":"2015","citation":{"bibtex":"@inproceedings{Pieczewski_Neuschmelting_Thiele_Grefkes_Goldbrunner_Weiss 
Lucas_2015, title={Good retest reliability of the rate of speech errors evoked by 10 Hz navigated repetitive transcranial magnetic stimulation in healthy volunteers: German Medical Science GMS Publishing House}, DOI={<a href=\"https://doi.org/10.3205/15dgnc394\">10.3205/15dgnc394</a>}, author={Pieczewski, Julia and Neuschmelting, Volker and Thiele, Kristina and Grefkes, Christian and Goldbrunner, Roland and Weiss Lucas, Carolin }, year={2015} }","short":"J. Pieczewski, V. Neuschmelting, K. Thiele, C. Grefkes, R. Goldbrunner, C. Weiss Lucas, in: 2015.","mla":"Pieczewski, Julia, et al. <i>Good Retest Reliability of the Rate of Speech Errors Evoked by 10 Hz Navigated Repetitive Transcranial Magnetic Stimulation in Healthy Volunteers: German Medical Science GMS Publishing House</i>. 2015, doi:<a href=\"https://doi.org/10.3205/15dgnc394\">10.3205/15dgnc394</a>.","apa":"Pieczewski, J., Neuschmelting, V., Thiele, K., Grefkes, C., Goldbrunner, R., &#38; Weiss Lucas, C. (2015). <i>Good retest reliability of the rate of speech errors evoked by 10 Hz navigated repetitive transcranial magnetic stimulation in healthy volunteers: German Medical Science GMS Publishing House</i>. <a href=\"https://doi.org/10.3205/15dgnc394\">https://doi.org/10.3205/15dgnc394</a>","ieee":"J. Pieczewski, V. Neuschmelting, K. Thiele, C. Grefkes, R. Goldbrunner, and C. Weiss Lucas, “Good retest reliability of the rate of speech errors evoked by 10 Hz navigated repetitive transcranial magnetic stimulation in healthy volunteers: German Medical Science GMS Publishing House,” 2015, doi: <a href=\"https://doi.org/10.3205/15dgnc394\">10.3205/15dgnc394</a>.","chicago":"Pieczewski, Julia, Volker Neuschmelting, Kristina Thiele, Christian Grefkes, Roland Goldbrunner, and Carolin  Weiss Lucas. “Good Retest Reliability of the Rate of Speech Errors Evoked by 10 Hz Navigated Repetitive Transcranial Magnetic Stimulation in Healthy Volunteers: German Medical Science GMS Publishing House,” 2015. 
<a href=\"https://doi.org/10.3205/15dgnc394\">https://doi.org/10.3205/15dgnc394</a>.","ama":"Pieczewski J, Neuschmelting V, Thiele K, Grefkes C, Goldbrunner R, Weiss Lucas C. Good retest reliability of the rate of speech errors evoked by 10 Hz navigated repetitive transcranial magnetic stimulation in healthy volunteers: German Medical Science GMS Publishing House. In: ; 2015. doi:<a href=\"https://doi.org/10.3205/15dgnc394\">10.3205/15dgnc394</a>"}},{"language":[{"iso":"eng"}],"keyword":["Accuracy","Acoustics","Estimation","Mathematical model","Soruce separation","Speech","Vectors","Bayes methods","Blind source separation","Directional statistics","Number of speakers","Speaker diarization"],"department":[{"_id":"54"}],"user_id":"44006","_id":"11753","status":"public","abstract":[{"text":"This contribution describes a step-wise source counting algorithm to determine the number of speakers in an offline scenario. Each speaker is identified by a variational expectation maximization (VEM) algorithm for complex Watson mixture models and therefore directly yields beamforming vectors for a subsequent speech separation process. An observation selection criterion is proposed which improves the robustness of the source counting in noise. The algorithm is compared to an alternative VEM approach with Gaussian mixture models based on directions of arrival and shown to deliver improved source counting accuracy. 
The article concludes by extending the offline algorithm towards a low-latency online estimation of the number of active sources from the streaming input data.","lang":"eng"}],"publication":"14th International Workshop on Acoustic Signal Enhancement (IWAENC 2014)","type":"conference","main_file_link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2014/DrChTrHaeb14.pdf","open_access":"1"}],"title":"Towards Online Source Counting in Speech Mixtures Applying a Variational EM for Complex Watson Mixture Models","date_created":"2019-07-12T05:27:35Z","author":[{"id":"11213","full_name":"Drude, Lukas","last_name":"Drude","first_name":"Lukas"},{"first_name":"Aleksej","last_name":"Chinaev","full_name":"Chinaev, Aleksej"},{"full_name":"Tran Vu, Dang Hai","last_name":"Tran Vu","first_name":"Dang Hai"},{"id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach","first_name":"Reinhold"}],"date_updated":"2022-01-06T06:51:08Z","oa":"1","page":"213-217","citation":{"ama":"Drude L, Chinaev A, Tran Vu DH, Haeb-Umbach R. Towards Online Source Counting in Speech Mixtures Applying a Variational EM for Complex Watson Mixture Models. In: <i>14th International Workshop on Acoustic Signal Enhancement (IWAENC 2014)</i>. ; 2014:213-217.","ieee":"L. Drude, A. Chinaev, D. H. Tran Vu, and R. Haeb-Umbach, “Towards Online Source Counting in Speech Mixtures Applying a Variational EM for Complex Watson Mixture Models,” in <i>14th International Workshop on Acoustic Signal Enhancement (IWAENC 2014)</i>, 2014, pp. 213–217.","chicago":"Drude, Lukas, Aleksej Chinaev, Dang Hai Tran Vu, and Reinhold Haeb-Umbach. “Towards Online Source Counting in Speech Mixtures Applying a Variational EM for Complex Watson Mixture Models.” In <i>14th International Workshop on Acoustic Signal Enhancement (IWAENC 2014)</i>, 213–17, 2014.","short":"L. Drude, A. Chinaev, D.H. Tran Vu, R. Haeb-Umbach, in: 14th International Workshop on Acoustic Signal Enhancement (IWAENC 2014), 2014, pp. 
213–217.","mla":"Drude, Lukas, et al. “Towards Online Source Counting in Speech Mixtures Applying a Variational EM for Complex Watson Mixture Models.” <i>14th International Workshop on Acoustic Signal Enhancement (IWAENC 2014)</i>, 2014, pp. 213–17.","bibtex":"@inproceedings{Drude_Chinaev_Tran Vu_Haeb-Umbach_2014, title={Towards Online Source Counting in Speech Mixtures Applying a Variational EM for Complex Watson Mixture Models}, booktitle={14th International Workshop on Acoustic Signal Enhancement (IWAENC 2014)}, author={Drude, Lukas and Chinaev, Aleksej and Tran Vu, Dang Hai and Haeb-Umbach, Reinhold}, year={2014}, pages={213–217} }","apa":"Drude, L., Chinaev, A., Tran Vu, D. H., &#38; Haeb-Umbach, R. (2014). Towards Online Source Counting in Speech Mixtures Applying a Variational EM for Complex Watson Mixture Models. In <i>14th International Workshop on Acoustic Signal Enhancement (IWAENC 2014)</i> (pp. 213–217)."},"year":"2014","related_material":{"link":[{"relation":"supplementary_material","description":"Poster","url":"https://groups.uni-paderborn.de/nt/pubs/2014/DrChTrHaeb14_Poster.pdf"}]}},{"title":"A New Observation Model in the Logarithmic Mel Power Spectral Domain for the Automatic Recognition of Noisy Reverberant Speech","doi":"10.1109/TASLP.2013.2285480","date_updated":"2022-01-06T06:51:11Z","volume":22,"author":[{"last_name":"Leutnant","full_name":"Leutnant, Volker","first_name":"Volker"},{"full_name":"Krueger, Alexander","last_name":"Krueger","first_name":"Alexander"},{"last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold"}],"date_created":"2019-07-12T05:29:41Z","year":"2014","intvolume":"        22","page":"95-109","citation":{"ieee":"V. Leutnant, A. Krueger, and R. Haeb-Umbach, “A New Observation Model in the Logarithmic Mel Power Spectral Domain for the Automatic Recognition of Noisy Reverberant Speech,” <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>, vol. 22, no. 1, pp. 
95–109, 2014.","chicago":"Leutnant, Volker, Alexander Krueger, and Reinhold Haeb-Umbach. “A New Observation Model in the Logarithmic Mel Power Spectral Domain for the Automatic Recognition of Noisy Reverberant Speech.” <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i> 22, no. 1 (2014): 95–109. <a href=\"https://doi.org/10.1109/TASLP.2013.2285480\">https://doi.org/10.1109/TASLP.2013.2285480</a>.","ama":"Leutnant V, Krueger A, Haeb-Umbach R. A New Observation Model in the Logarithmic Mel Power Spectral Domain for the Automatic Recognition of Noisy Reverberant Speech. <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>. 2014;22(1):95-109. doi:<a href=\"https://doi.org/10.1109/TASLP.2013.2285480\">10.1109/TASLP.2013.2285480</a>","bibtex":"@article{Leutnant_Krueger_Haeb-Umbach_2014, title={A New Observation Model in the Logarithmic Mel Power Spectral Domain for the Automatic Recognition of Noisy Reverberant Speech}, volume={22}, DOI={<a href=\"https://doi.org/10.1109/TASLP.2013.2285480\">10.1109/TASLP.2013.2285480</a>}, number={1}, journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing}, author={Leutnant, Volker and Krueger, Alexander and Haeb-Umbach, Reinhold}, year={2014}, pages={95–109} }","mla":"Leutnant, Volker, et al. “A New Observation Model in the Logarithmic Mel Power Spectral Domain for the Automatic Recognition of Noisy Reverberant Speech.” <i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>, vol. 22, no. 1, 2014, pp. 95–109, doi:<a href=\"https://doi.org/10.1109/TASLP.2013.2285480\">10.1109/TASLP.2013.2285480</a>.","short":"V. Leutnant, A. Krueger, R. Haeb-Umbach, IEEE/ACM Transactions on Audio, Speech, and Language Processing 22 (2014) 95–109.","apa":"Leutnant, V., Krueger, A., &#38; Haeb-Umbach, R. (2014). A New Observation Model in the Logarithmic Mel Power Spectral Domain for the Automatic Recognition of Noisy Reverberant Speech. 
<i>IEEE/ACM Transactions on Audio, Speech, and Language Processing</i>, <i>22</i>(1), 95–109. <a href=\"https://doi.org/10.1109/TASLP.2013.2285480\">https://doi.org/10.1109/TASLP.2013.2285480</a>"},"publication_identifier":{"issn":["2329-9290"]},"issue":"1","keyword":["computational complexity","reverberation","speech recognition","automatic speech recognition","background noise","clean speech","computational complexity","energy compensation","logarithmic mel power spectral domain","mel frequency cepstral coefficients","microphone input signals","model-based feature compensation schemes","noisy reverberant speech automatic recognition","noisy reverberant speech features","reverberation","Atmospheric modeling","Computational modeling","Noise","Noise measurement","Reverberation","Speech","Vectors","Model-based feature compensation","observation model for reverberant and noisy speech","recursive observation model","robust automatic speech recognition"],"language":[{"iso":"eng"}],"_id":"11861","department":[{"_id":"54"}],"user_id":"44006","abstract":[{"lang":"eng","text":"In this contribution we present a theoretical and experimental investigation into the effects of reverberation and noise on features in the logarithmic mel power spectral domain, an intermediate stage in the computation of the mel frequency cepstral coefficients, prevalent in automatic speech recognition (ASR). Gaining insight into the complex interaction between clean speech, noise, and noisy reverberant speech features is essential for any ASR system to be robust against noise and reverberation present in distant microphone input signals. The findings are gathered in a probabilistic formulation of an observation model which may be used in model-based feature compensation schemes. The proposed observation model extends previous models in three major directions: First, the contribution of additive background noise to the observation error is explicitly taken into account. 
Second, an energy compensation constant is introduced which ensures an unbiased estimate of the reverberant speech features, and, third, a recursive variant of the observation model is developed resulting in reduced computational complexity when used in model-based feature compensation. The experimental section is used to evaluate the accuracy of the model and to describe how its parameters can be determined from test data."}],"status":"public","publication":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","type":"journal_article"},{"main_file_link":[{"open_access":"1","url":"http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6732927"}],"doi":"10.1109/TASLP.2014.2304637","title":"An Overview of Noise-Robust Automatic Speech Recognition","date_created":"2019-07-12T05:29:47Z","author":[{"full_name":"Li, Jinyu","last_name":"Li","first_name":"Jinyu"},{"full_name":"Deng, Li","last_name":"Deng","first_name":"Li"},{"first_name":"Yifan","last_name":"Gong","full_name":"Gong, Yifan"},{"first_name":"Reinhold","id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach"}],"volume":22,"date_updated":"2022-01-06T06:51:11Z","oa":"1","citation":{"ama":"Li J, Deng L, Gong Y, Haeb-Umbach R. An Overview of Noise-Robust Automatic Speech Recognition. <i>IEEE Transactions on Audio, Speech and Language Processing</i>. 2014;22(4):745-777. doi:<a href=\"https://doi.org/10.1109/TASLP.2014.2304637\">10.1109/TASLP.2014.2304637</a>","chicago":"Li, Jinyu, Li Deng, Yifan Gong, and Reinhold Haeb-Umbach. “An Overview of Noise-Robust Automatic Speech Recognition.” <i>IEEE Transactions on Audio, Speech and Language Processing</i> 22, no. 4 (2014): 745–77. <a href=\"https://doi.org/10.1109/TASLP.2014.2304637\">https://doi.org/10.1109/TASLP.2014.2304637</a>.","ieee":"J. Li, L. Deng, Y. Gong, and R. Haeb-Umbach, “An Overview of Noise-Robust Automatic Speech Recognition,” <i>IEEE Transactions on Audio, Speech and Language Processing</i>, vol. 22, no. 4, pp. 
745–777, 2014.","bibtex":"@article{Li_Deng_Gong_Haeb-Umbach_2014, title={An Overview of Noise-Robust Automatic Speech Recognition}, volume={22}, DOI={<a href=\"https://doi.org/10.1109/TASLP.2014.2304637\">10.1109/TASLP.2014.2304637</a>}, number={4}, journal={IEEE Transactions on Audio, Speech and Language Processing}, author={Li, Jinyu and Deng, Li and Gong, Yifan and Haeb-Umbach, Reinhold}, year={2014}, pages={745–777} }","mla":"Li, Jinyu, et al. “An Overview of Noise-Robust Automatic Speech Recognition.” <i>IEEE Transactions on Audio, Speech and Language Processing</i>, vol. 22, no. 4, 2014, pp. 745–77, doi:<a href=\"https://doi.org/10.1109/TASLP.2014.2304637\">10.1109/TASLP.2014.2304637</a>.","short":"J. Li, L. Deng, Y. Gong, R. Haeb-Umbach, IEEE Transactions on Audio, Speech and Language Processing 22 (2014) 745–777.","apa":"Li, J., Deng, L., Gong, Y., &#38; Haeb-Umbach, R. (2014). An Overview of Noise-Robust Automatic Speech Recognition. <i>IEEE Transactions on Audio, Speech and Language Processing</i>, <i>22</i>(4), 745–777. <a href=\"https://doi.org/10.1109/TASLP.2014.2304637\">https://doi.org/10.1109/TASLP.2014.2304637</a>"},"intvolume":"        22","page":"745-777","year":"2014","issue":"4","language":[{"iso":"eng"}],"keyword":["Speech recognition","compensation","distortion modeling","joint model training","noise","robustness","uncertainty processing"],"user_id":"44006","department":[{"_id":"54"}],"_id":"11867","status":"public","abstract":[{"text":"New waves of consumer-centric applications, such as voice search and voice interaction with mobile devices and home entertainment systems, increasingly require automatic speech recognition (ASR) to be robust to the full range of real-world noise and other acoustic distorting conditions. Despite its practical importance, however, the inherent links between and distinctions among the myriad of methods for noise-robust ASR have yet to be carefully studied in order to advance the field further. 
To this end, it is critical to establish a solid, consistent, and common mathematical foundation for noise-robust ASR, which is lacking at present. This article is intended to fill this gap and to provide a thorough overview of modern noise-robust techniques for ASR developed over the past 30 years. We emphasize methods that are proven to be successful and that are likely to sustain or expand their future applicability. We distill key insights from our comprehensive overview in this field and take a fresh look at a few old problems, which nevertheless are still highly relevant today. Specifically, we have analyzed and categorized a wide range of noise-robust techniques using five different criteria: 1) feature-domain vs. model-domain processing, 2) the use of prior knowledge about the acoustic environment distortion, 3) the use of explicit environment-distortion models, 4) deterministic vs. uncertainty processing, and 5) the use of acoustic models trained jointly with the same feature enhancement or model adaptation process used in the testing stage. With this taxonomy-oriented review, we equip the reader with the insight to choose among techniques and with the awareness of the performance-complexity tradeoffs. The pros and cons of using different noise-robust ASR techniques in practical application scenarios are provided as a guide to interested practitioners. 
The current challenges and future research directions in this field is also carefully analyzed.","lang":"eng"}],"type":"journal_article","publication":"IEEE Transactions on Audio, Speech and Language Processing"},{"doi":"10.1109/ICASSP.2013.6638984","title":"GMM-based significance decoding","author":[{"last_name":"Abdelaziz","full_name":"Abdelaziz, Ahmed H.","first_name":"Ahmed H."},{"first_name":"Steffen","last_name":"Zeiler","full_name":"Zeiler, Steffen"},{"first_name":"Dorothea","last_name":"Kolossa","full_name":"Kolossa, Dorothea"},{"first_name":"Volker","last_name":"Leutnant","full_name":"Leutnant, Volker"},{"full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach","first_name":"Reinhold"}],"date_created":"2019-07-12T05:26:53Z","date_updated":"2022-01-06T06:51:07Z","page":"6827-6831","citation":{"apa":"Abdelaziz, A. H., Zeiler, S., Kolossa, D., Leutnant, V., &#38; Haeb-Umbach, R. (2013). GMM-based significance decoding. In <i>Acoustics, Speech and Signal Processing (ICASSP), 2013 IEEE International Conference on</i> (pp. 6827–6831). <a href=\"https://doi.org/10.1109/ICASSP.2013.6638984\">https://doi.org/10.1109/ICASSP.2013.6638984</a>","bibtex":"@inproceedings{Abdelaziz_Zeiler_Kolossa_Leutnant_Haeb-Umbach_2013, title={GMM-based significance decoding}, DOI={<a href=\"https://doi.org/10.1109/ICASSP.2013.6638984\">10.1109/ICASSP.2013.6638984</a>}, booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2013 IEEE International Conference on}, author={Abdelaziz, Ahmed H. and Zeiler, Steffen and Kolossa, Dorothea and Leutnant, Volker and Haeb-Umbach, Reinhold}, year={2013}, pages={6827–6831} }","short":"A.H. Abdelaziz, S. Zeiler, D. Kolossa, V. Leutnant, R. Haeb-Umbach, in: Acoustics, Speech and Signal Processing (ICASSP), 2013 IEEE International Conference On, 2013, pp. 6827–6831.","mla":"Abdelaziz, Ahmed H., et al. 
“GMM-Based Significance Decoding.” <i>Acoustics, Speech and Signal Processing (ICASSP), 2013 IEEE International Conference On</i>, 2013, pp. 6827–31, doi:<a href=\"https://doi.org/10.1109/ICASSP.2013.6638984\">10.1109/ICASSP.2013.6638984</a>.","ieee":"A. H. Abdelaziz, S. Zeiler, D. Kolossa, V. Leutnant, and R. Haeb-Umbach, “GMM-based significance decoding,” in <i>Acoustics, Speech and Signal Processing (ICASSP), 2013 IEEE International Conference on</i>, 2013, pp. 6827–6831.","chicago":"Abdelaziz, Ahmed H., Steffen Zeiler, Dorothea Kolossa, Volker Leutnant, and Reinhold Haeb-Umbach. “GMM-Based Significance Decoding.” In <i>Acoustics, Speech and Signal Processing (ICASSP), 2013 IEEE International Conference On</i>, 6827–31, 2013. <a href=\"https://doi.org/10.1109/ICASSP.2013.6638984\">https://doi.org/10.1109/ICASSP.2013.6638984</a>.","ama":"Abdelaziz AH, Zeiler S, Kolossa D, Leutnant V, Haeb-Umbach R. GMM-based significance decoding. In: <i>Acoustics, Speech and Signal Processing (ICASSP), 2013 IEEE International Conference On</i>. ; 2013:6827-6831. 
doi:<a href=\"https://doi.org/10.1109/ICASSP.2013.6638984\">10.1109/ICASSP.2013.6638984</a>"},"year":"2013","publication_identifier":{"issn":["1520-6149"]},"language":[{"iso":"eng"}],"keyword":["Bayes methods","Gaussian processes","convolution","decision theory","decoding","noise","reverberation","speech coding","speech recognition","Bayesian decision rule","GMM","Gaussian mixture models","additive noise scenarios","automatic speech recognition systems","convolutive noise scenarios","decoding approach","mathematical framework","reverberant environments","significance decoding","speech feature estimation","uncertainty-of-observation techniques","Hidden Markov models","Maximum likelihood decoding","Noise","Speech","Speech recognition","Uncertainty","Uncertainty-of-observation","modified imputation","noise robust speech recognition","significance decoding","uncertainty decoding"],"department":[{"_id":"54"}],"user_id":"44006","_id":"11716","status":"public","abstract":[{"text":"The accuracy of automatic speech recognition systems in noisy and reverberant environments can be improved notably by exploiting the uncertainty of the estimated speech features using so-called uncertainty-of-observation techniques. In this paper, we introduce a new Bayesian decision rule that can serve as a mathematical framework from which both known and new uncertainty-of-observation techniques can be either derived or approximated. The new decision rule in its direct form leads to the new significance decoding approach for Gaussian mixture models, which results in better performance compared to standard uncertainty-of-observation techniques in different additive and convolutive noise scenarios.","lang":"eng"}],"publication":"Acoustics, Speech and Signal Processing (ICASSP), 2013 IEEE International Conference on","type":"conference"},{"year":"2013","citation":{"ama":"Kinoshita K, Delcroix M, Yoshioka T, et al. 
The reverb challenge: a common evaluation framework for dereverberation and recognition of reverberant speech. In: <i> IEEE Workshop on Applications of Signal Processing to Audio and Acoustics </i>. ; 2013:22-23.","ieee":"K. Kinoshita <i>et al.</i>, “The reverb challenge: a common evaluation framework for dereverberation and recognition of reverberant speech,” in <i> IEEE Workshop on Applications of Signal Processing to Audio and Acoustics </i>, 2013, pp. 22–23.","chicago":"Kinoshita, Keisuke, Marc Delcroix, Takuya Yoshioka, Tomohiro Nakatani, Emanuel Habets, Reinhold Haeb-Umbach, Volker Leutnant, et al. “The Reverb Challenge: A Common Evaluation Framework for Dereverberation and Recognition of Reverberant Speech.” In <i> IEEE Workshop on Applications of Signal Processing to Audio and Acoustics </i>, 22–23, 2013.","bibtex":"@inproceedings{Kinoshita_Delcroix_Yoshioka_Nakatani_Habets_Haeb-Umbach_Leutnant_Sehr_Kellermann_Maas_et al._2013, title={The reverb challenge: a common evaluation framework for dereverberation and recognition of reverberant speech}, booktitle={ IEEE Workshop on Applications of Signal Processing to Audio and Acoustics }, author={Kinoshita, Keisuke and Delcroix, Marc and Yoshioka, Takuya and Nakatani, Tomohiro and Habets, Emanuel and Haeb-Umbach, Reinhold and Leutnant, Volker and Sehr, Armin and Kellermann, Walter and Maas, Roland and et al.}, year={2013}, pages={22–23} }","short":"K. Kinoshita, M. Delcroix, T. Yoshioka, T. Nakatani, E. Habets, R. Haeb-Umbach, V. Leutnant, A. Sehr, W. Kellermann, R. Maas, S. Gannot, B. Raj, in:  IEEE Workshop on Applications of Signal Processing to Audio and Acoustics , 2013, pp. 22–23.","mla":"Kinoshita, Keisuke, et al. “The Reverb Challenge: A Common Evaluation Framework for Dereverberation and Recognition of Reverberant Speech.” <i> IEEE Workshop on Applications of Signal Processing to Audio and Acoustics </i>, 2013, pp. 
22–23.","apa":"Kinoshita, K., Delcroix, M., Yoshioka, T., Nakatani, T., Habets, E., Haeb-Umbach, R., … Raj, B. (2013). The reverb challenge: a common evaluation framework for dereverberation and recognition of reverberant speech. In <i> IEEE Workshop on Applications of Signal Processing to Audio and Acoustics </i> (pp. 22–23)."},"page":" 22-23 ","oa":"1","date_updated":"2022-01-06T06:51:11Z","author":[{"first_name":"Keisuke","last_name":"Kinoshita","full_name":"Kinoshita, Keisuke"},{"first_name":"Marc","full_name":"Delcroix, Marc","last_name":"Delcroix"},{"full_name":"Yoshioka, Takuya","last_name":"Yoshioka","first_name":"Takuya"},{"first_name":"Tomohiro","last_name":"Nakatani","full_name":"Nakatani, Tomohiro"},{"full_name":"Habets, Emanuel","last_name":"Habets","first_name":"Emanuel"},{"full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach","first_name":"Reinhold"},{"first_name":"Volker","full_name":"Leutnant, Volker","last_name":"Leutnant"},{"first_name":"Armin","full_name":"Sehr, Armin","last_name":"Sehr"},{"last_name":"Kellermann","full_name":"Kellermann, Walter","first_name":"Walter"},{"first_name":"Roland","full_name":"Maas, Roland","last_name":"Maas"},{"full_name":"Gannot, Sharon","last_name":"Gannot","first_name":"Sharon"},{"full_name":"Raj, Bhiksha","last_name":"Raj","first_name":"Bhiksha"}],"date_created":"2019-07-12T05:29:17Z","title":"The reverb challenge: a common evaluation framework for dereverberation and recognition of reverberant speech","main_file_link":[{"open_access":"1","url":"https://groups.uni-paderborn.de/nt/pubs/2013/Reverb2013.pdf"}],"type":"conference","publication":" IEEE Workshop on Applications of Signal Processing to Audio and Acoustics ","abstract":[{"lang":"eng","text":"Recently, substantial progress has been made in the field of reverberant speech signal processing, including both single- and multichannel de-reverberation techniques, and automatic speech recognition (ASR) techniques robust to reverberation. 
To evaluate state-of-the-art algorithms and obtain new insights regarding potential future research directions, we propose a common evaluation framework including datasets, tasks, and evaluation metrics for both speech enhancement and ASR techniques. The proposed framework will be used as a common basis for the REVERB (REverberant Voice Enhancement and Recognition Benchmark) challenge. This paper describes the rationale behind the challenge, and provides a detailed description of the evaluation framework and benchmark results."}],"status":"public","_id":"11841","user_id":"44006","department":[{"_id":"54"}],"keyword":["Reverberant speech","dereverberation","ASR","evaluation","challenge"],"language":[{"iso":"eng"}]},{"language":[{"iso":"eng"}],"keyword":["Bayes methods","compensation","error statistics","reverberation","speech recognition","Bayesian feature enhancement","background noise","clean speech feature vectors","compensation","connected digits recognition task","error statistics","memory requirements","noisy reverberant data","posteriori probability density function","recursive formulation","reverberant logarithmic mel power spectral coefficients","robust automatic speech recognition","signal-to-noise ratios","time-variant observation","word error rate reduction","Robust automatic speech recognition","model-based Bayesian feature enhancement","observation model for reverberant and noisy speech","recursive observation model"],"user_id":"44006","department":[{"_id":"54"}],"_id":"11862","status":"public","abstract":[{"text":"In this contribution we extend a previously proposed Bayesian approach for the enhancement of reverberant logarithmic mel power spectral coefficients for robust automatic speech recognition to the additional compensation of background noise. 
A recently proposed observation model is employed whose time-variant observation error statistics are obtained as a side product of the inference of the a posteriori probability density function of the clean speech feature vectors. Further a reduction of the computational effort and the memory requirements is achieved by using a recursive formulation of the observation model. The performance of the proposed algorithms is first experimentally studied on a connected digits recognition task with artificially created noisy reverberant data. It is shown that the use of the time-variant observation error model leads to a significant error rate reduction at low signal-to-noise ratios compared to a time-invariant model. Further experiments were conducted on a 5000 word task recorded in a reverberant and noisy environment. A significant word error rate reduction was obtained demonstrating the effectiveness of the approach on real-world data.","lang":"eng"}],"type":"journal_article","publication":"IEEE Transactions on Audio, Speech, and Language Processing","doi":"10.1109/TASL.2013.2258013","title":"Bayesian Feature Enhancement for Reverberation and Noise Robust Speech Recognition","date_created":"2019-07-12T05:29:42Z","author":[{"last_name":"Leutnant","full_name":"Leutnant, Volker","first_name":"Volker"},{"full_name":"Krueger, Alexander","last_name":"Krueger","first_name":"Alexander"},{"first_name":"Reinhold","id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach"}],"volume":21,"date_updated":"2022-01-06T06:51:11Z","citation":{"ama":"Leutnant V, Krueger A, Haeb-Umbach R. Bayesian Feature Enhancement for Reverberation and Noise Robust Speech Recognition. <i>IEEE Transactions on Audio, Speech, and Language Processing</i>. 2013;21(8):1640-1652. doi:<a href=\"https://doi.org/10.1109/TASL.2013.2258013\">10.1109/TASL.2013.2258013</a>","chicago":"Leutnant, Volker, Alexander Krueger, and Reinhold Haeb-Umbach. 
“Bayesian Feature Enhancement for Reverberation and Noise Robust Speech Recognition.” <i>IEEE Transactions on Audio, Speech, and Language Processing</i> 21, no. 8 (2013): 1640–52. <a href=\"https://doi.org/10.1109/TASL.2013.2258013\">https://doi.org/10.1109/TASL.2013.2258013</a>.","ieee":"V. Leutnant, A. Krueger, and R. Haeb-Umbach, “Bayesian Feature Enhancement for Reverberation and Noise Robust Speech Recognition,” <i>IEEE Transactions on Audio, Speech, and Language Processing</i>, vol. 21, no. 8, pp. 1640–1652, 2013.","apa":"Leutnant, V., Krueger, A., &#38; Haeb-Umbach, R. (2013). Bayesian Feature Enhancement for Reverberation and Noise Robust Speech Recognition. <i>IEEE Transactions on Audio, Speech, and Language Processing</i>, <i>21</i>(8), 1640–1652. <a href=\"https://doi.org/10.1109/TASL.2013.2258013\">https://doi.org/10.1109/TASL.2013.2258013</a>","bibtex":"@article{Leutnant_Krueger_Haeb-Umbach_2013, title={Bayesian Feature Enhancement for Reverberation and Noise Robust Speech Recognition}, volume={21}, DOI={<a href=\"https://doi.org/10.1109/TASL.2013.2258013\">10.1109/TASL.2013.2258013</a>}, number={8}, journal={IEEE Transactions on Audio, Speech, and Language Processing}, author={Leutnant, Volker and Krueger, Alexander and Haeb-Umbach, Reinhold}, year={2013}, pages={1640–1652} }","short":"V. Leutnant, A. Krueger, R. Haeb-Umbach, IEEE Transactions on Audio, Speech, and Language Processing 21 (2013) 1640–1652.","mla":"Leutnant, Volker, et al. “Bayesian Feature Enhancement for Reverberation and Noise Robust Speech Recognition.” <i>IEEE Transactions on Audio, Speech, and Language Processing</i>, vol. 21, no. 8, 2013, pp. 1640–52, doi:<a href=\"https://doi.org/10.1109/TASL.2013.2258013\">10.1109/TASL.2013.2258013</a>."},"intvolume":"        21","page":"1640-1652","year":"2013","issue":"8"},{"publication_identifier":{"issn":["1520-6149"]},"citation":{"short":"D.H.T. Vu, R. 
Haeb-Umbach, in: 38th International Conference on Acoustics, Speech and Signal Processing (ICASSP 2013), 2013, pp. 863–867.","bibtex":"@inproceedings{Vu_Haeb-Umbach_2013, title={Using the turbo principle for exploiting temporal and spectral correlations in speech presence probability estimation}, DOI={<a href=\"https://doi.org/10.1109/ICASSP.2013.6637771\">10.1109/ICASSP.2013.6637771</a>}, booktitle={38th International Conference on Acoustics, Speech and Signal Processing (ICASSP 2013)}, author={Vu, Dang Hai Tran and Haeb-Umbach, Reinhold}, year={2013}, pages={863–867} }","mla":"Vu, Dang Hai Tran, and Reinhold Haeb-Umbach. “Using the Turbo Principle for Exploiting Temporal and Spectral Correlations in Speech Presence Probability Estimation.” <i>38th International Conference on Acoustics, Speech and Signal Processing (ICASSP 2013)</i>, 2013, pp. 863–67, doi:<a href=\"https://doi.org/10.1109/ICASSP.2013.6637771\">10.1109/ICASSP.2013.6637771</a>.","apa":"Vu, D. H. T., &#38; Haeb-Umbach, R. (2013). Using the turbo principle for exploiting temporal and spectral correlations in speech presence probability estimation. In <i>38th International Conference on Acoustics, Speech and Signal Processing (ICASSP 2013)</i> (pp. 863–867). <a href=\"https://doi.org/10.1109/ICASSP.2013.6637771\">https://doi.org/10.1109/ICASSP.2013.6637771</a>","ama":"Vu DHT, Haeb-Umbach R. Using the turbo principle for exploiting temporal and spectral correlations in speech presence probability estimation. In: <i>38th International Conference on Acoustics, Speech and Signal Processing (ICASSP 2013)</i>. ; 2013:863-867. doi:<a href=\"https://doi.org/10.1109/ICASSP.2013.6637771\">10.1109/ICASSP.2013.6637771</a>","ieee":"D. H. T. Vu and R. Haeb-Umbach, “Using the turbo principle for exploiting temporal and spectral correlations in speech presence probability estimation,” in <i>38th International Conference on Acoustics, Speech and Signal Processing (ICASSP 2013)</i>, 2013, pp. 
863–867.","chicago":"Vu, Dang Hai Tran, and Reinhold Haeb-Umbach. “Using the Turbo Principle for Exploiting Temporal and Spectral Correlations in Speech Presence Probability Estimation.” In <i>38th International Conference on Acoustics, Speech and Signal Processing (ICASSP 2013)</i>, 863–67, 2013. <a href=\"https://doi.org/10.1109/ICASSP.2013.6637771\">https://doi.org/10.1109/ICASSP.2013.6637771</a>."},"page":"863-867","year":"2013","date_created":"2019-07-12T05:30:45Z","author":[{"first_name":"Dang Hai Tran","full_name":"Vu, Dang Hai Tran","last_name":"Vu"},{"last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242","first_name":"Reinhold"}],"date_updated":"2022-01-06T06:51:12Z","doi":"10.1109/ICASSP.2013.6637771","title":"Using the turbo principle for exploiting temporal and spectral correlations in speech presence probability estimation","type":"conference","publication":"38th International Conference on Acoustics, Speech and Signal Processing (ICASSP 2013)","status":"public","abstract":[{"lang":"eng","text":"In this paper we present a speech presence probability (SPP) estimation algorithm which exploits both temporal and spectral correlations of speech. To this end, the SPP estimation is formulated as the posterior probability estimation of the states of a two-dimensional (2D) Hidden Markov Model (HMM). We derive an iterative algorithm to decode the 2D-HMM which is based on the turbo principle. 
The experimental results show that indeed the SPP estimates improve from iteration to iteration, and further clearly outperform another state-of-the-art SPP estimation algorithm."}],"user_id":"44006","department":[{"_id":"54"}],"_id":"11917","language":[{"iso":"eng"}],"keyword":["correlation methods","estimation theory","hidden Markov models","iterative methods","probability","spectral analysis","speech processing","2D HMM","SPP estimates","iterative algorithm","posterior probability estimation","spectral correlation","speech presence probability estimation","state-of-the-art SPP estimation algorithm","temporal correlation","turbo principle","two-dimensional hidden Markov model","Correlation","Decoding","Estimation","Iterative decoding","Noise","Speech","Vectors"]},{"department":[{"_id":"54"}],"user_id":"44006","_id":"11745","language":[{"iso":"eng"}],"keyword":["MAP parameter estimation","noise power estimation","speech enhancement"],"publication":"37th International Conference on Acoustics, Speech and Signal Processing (ICASSP 2012)","type":"conference","status":"public","abstract":[{"text":"In this paper we present a novel noise power spectral density tracking algorithm and its use in single-channel speech enhancement. It has the unique feature that it is able to track the noise statistics even if speech is dominant in a given time-frequency bin. As a consequence it can follow non-stationary noise superposed by speech, even in the critical case of rising noise power. The algorithm requires an initial estimate of the power spectrum of speech and is thus meant to be used as a postprocessor to a first speech enhancement stage. 
An experimental comparison with a state-of-the-art noise tracking algorithm demonstrates lower estimation errors under low SNR conditions and smaller fluctuations of the estimated values, resulting in improved speech quality as measured by PESQ scores.","lang":"eng"}],"date_created":"2019-07-12T05:27:26Z","author":[{"last_name":"Chinaev","full_name":"Chinaev, Aleksej","first_name":"Aleksej"},{"first_name":"Alexander","full_name":"Krueger, Alexander","last_name":"Krueger"},{"last_name":"Tran Vu","full_name":"Tran Vu, Dang Hai","first_name":"Dang Hai"},{"last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242","first_name":"Reinhold"}],"date_updated":"2022-01-06T06:51:08Z","oa":"1","main_file_link":[{"open_access":"1","url":"https://groups.uni-paderborn.de/nt/pubs/2012/ChKrDaHa12.pdf"}],"title":"Improved Noise Power Spectral Density Tracking by a MAP-based Postprocessor","related_material":{"link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2012/ChKrDaHa12_Talk.pdf","relation":"supplementary_material","description":"Presentation"}]},"citation":{"ama":"Chinaev A, Krueger A, Tran Vu DH, Haeb-Umbach R. Improved Noise Power Spectral Density Tracking by a MAP-based Postprocessor. In: <i>37th International Conference on Acoustics, Speech and Signal Processing (ICASSP 2012)</i>. ; 2012.","ieee":"A. Chinaev, A. Krueger, D. H. Tran Vu, and R. Haeb-Umbach, “Improved Noise Power Spectral Density Tracking by a MAP-based Postprocessor,” in <i>37th International Conference on Acoustics, Speech and Signal Processing (ICASSP 2012)</i>, 2012.","chicago":"Chinaev, Aleksej, Alexander Krueger, Dang Hai Tran Vu, and Reinhold Haeb-Umbach. 
“Improved Noise Power Spectral Density Tracking by a MAP-Based Postprocessor.” In <i>37th International Conference on Acoustics, Speech and Signal Processing (ICASSP 2012)</i>, 2012.","bibtex":"@inproceedings{Chinaev_Krueger_Tran Vu_Haeb-Umbach_2012, title={Improved Noise Power Spectral Density Tracking by a MAP-based Postprocessor}, booktitle={37th International Conference on Acoustics, Speech and Signal Processing (ICASSP 2012)}, author={Chinaev, Aleksej and Krueger, Alexander and Tran Vu, Dang Hai and Haeb-Umbach, Reinhold}, year={2012} }","mla":"Chinaev, Aleksej, et al. “Improved Noise Power Spectral Density Tracking by a MAP-Based Postprocessor.” <i>37th International Conference on Acoustics, Speech and Signal Processing (ICASSP 2012)</i>, 2012.","short":"A. Chinaev, A. Krueger, D.H. Tran Vu, R. Haeb-Umbach, in: 37th International Conference on Acoustics, Speech and Signal Processing (ICASSP 2012), 2012.","apa":"Chinaev, A., Krueger, A., Tran Vu, D. H., &#38; Haeb-Umbach, R. (2012). Improved Noise Power Spectral Density Tracking by a MAP-based Postprocessor. In <i>37th International Conference on Acoustics, Speech and Signal Processing (ICASSP 2012)</i>."},"year":"2012"},{"abstract":[{"text":"In this work, an observation model for the joint compensation of noise and reverberation in the logarithmic mel power spectral density domain is considered. It relates the features of the noisy reverberant speech to those of the non-reverberant speech and the noise. In contrast to enhancement of features only corrupted by reverberation (reverberant features), enhancement of noisy reverberant features requires a more sophisticated model for the error introduced by the proposed observation model. 
In a first consideration, it will be shown that this error is highly dependent on the instantaneous ratio of the power of reverberant speech to the power of the noise and, moreover, sensitive to the phase between reverberant speech and noise in the short-time discrete Fourier domain. Afterwards, a statistically motivated approach will be presented allowing for the model of the observation error to be inferred from the error model previously used for the reverberation only case. Finally, the developed observation error model will be utilized in a Bayesian feature enhancement scheme, leading to improvements in word accuracy on the AURORA5 database.","lang":"eng"}],"status":"public","publication":"Signal Processing, Communications and Computing (ICSPCC), 2012 IEEE International Conference on","type":"conference","keyword":["Robust Automatic Speech Recognition","Bayesian feature enhancement","observation model for reverberant and noisy speech"],"language":[{"iso":"eng"}],"_id":"11864","department":[{"_id":"54"}],"user_id":"44006","year":"2012","citation":{"ama":"Leutnant V, Krueger A, Haeb-Umbach R. A Statistical Observation Model For Noisy Reverberant Speech Features and its Application to Robust ASR. In: <i>Signal Processing, Communications and Computing (ICSPCC), 2012 IEEE International Conference On</i>. ; 2012.","chicago":"Leutnant, Volker, Alexander Krueger, and Reinhold Haeb-Umbach. “A Statistical Observation Model For Noisy Reverberant Speech Features and Its Application to Robust ASR.” In <i>Signal Processing, Communications and Computing (ICSPCC), 2012 IEEE International Conference On</i>, 2012.","ieee":"V. Leutnant, A. Krueger, and R. Haeb-Umbach, “A Statistical Observation Model For Noisy Reverberant Speech Features and its Application to Robust ASR,” in <i>Signal Processing, Communications and Computing (ICSPCC), 2012 IEEE International Conference on</i>, 2012.","apa":"Leutnant, V., Krueger, A., &#38; Haeb-Umbach, R. (2012). 
A Statistical Observation Model For Noisy Reverberant Speech Features and its Application to Robust ASR. In <i>Signal Processing, Communications and Computing (ICSPCC), 2012 IEEE International Conference on</i>.","mla":"Leutnant, Volker, et al. “A Statistical Observation Model For Noisy Reverberant Speech Features and Its Application to Robust ASR.” <i>Signal Processing, Communications and Computing (ICSPCC), 2012 IEEE International Conference On</i>, 2012.","short":"V. Leutnant, A. Krueger, R. Haeb-Umbach, in: Signal Processing, Communications and Computing (ICSPCC), 2012 IEEE International Conference On, 2012.","bibtex":"@inproceedings{Leutnant_Krueger_Haeb-Umbach_2012, title={A Statistical Observation Model For Noisy Reverberant Speech Features and its Application to Robust ASR}, booktitle={Signal Processing, Communications and Computing (ICSPCC), 2012 IEEE International Conference on}, author={Leutnant, Volker and Krueger, Alexander and Haeb-Umbach, Reinhold}, year={2012} }"},"title":"A Statistical Observation Model For Noisy Reverberant Speech Features and its Application to Robust ASR","main_file_link":[{"url":"http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6335731","open_access":"1"}],"oa":"1","date_updated":"2022-01-06T06:51:11Z","author":[{"first_name":"Volker","full_name":"Leutnant, Volker","last_name":"Leutnant"},{"last_name":"Krueger","full_name":"Krueger, Alexander","first_name":"Alexander"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold"}],"date_created":"2019-07-12T05:29:44Z"}]
