[{"type":"conference","publication":"ICASSP 2019, Brighton, UK","file":[{"creator":"huesera","date_created":"2019-09-18T08:28:39Z","date_updated":"2019-09-19T07:05:57Z","file_name":"ICASSP_2019_Neumann_Paper.pdf","access_level":"open_access","file_id":"13272","file_size":126453,"content_type":"application/pdf","relation":"main_file"}],"status":"public","abstract":[{"text":"Automatic meeting analysis comprises the tasks of speaker counting, speaker diarization, and the separation of overlapped speech, followed by automatic speech recognition. This all has to be carried out on arbitrarily long sessions and, ideally, in an online or block-online manner. While significant progress has been made on individual tasks, this paper presents for the first time an all-neural approach to simultaneous speaker counting, diarization and source separation. The NN-based estimator operates in a block-online fashion and tracks speakers even if they remain silent for a number of time blocks, thus learning a stable output order for the separated sources. The neural network is recurrent over time as well as over the number of sources. The simulation experiments show that state of the art separation performance is achieved, while at the same time delivering good diarization and source counting results. It even generalizes well to an unseen large number of blocks.","lang":"eng"}],"user_id":"59789","department":[{"_id":"54"}],"_id":"13271","language":[{"iso":"eng"}],"file_date_updated":"2019-09-19T07:05:57Z","ddc":["000"],"has_accepted_license":"1","citation":{"apa":"von Neumann, T., Kinoshita, K., Delcroix, M., Araki, S., Nakatani, T., &#38; Haeb-Umbach, R. (2019). All-neural Online Source Separation, Counting, and Diarization for Meeting Analysis. 
In <i>ICASSP 2019, Brighton, UK</i>.","bibtex":"@inproceedings{von Neumann_Kinoshita_Delcroix_Araki_Nakatani_Haeb-Umbach_2019, title={All-neural Online Source Separation, Counting, and Diarization for Meeting Analysis}, booktitle={ICASSP 2019, Brighton, UK}, author={von Neumann, Thilo and Kinoshita, Keisuke and Delcroix, Marc and Araki, Shoko and Nakatani, Tomohiro and Haeb-Umbach, Reinhold}, year={2019} }","mla":"von Neumann, Thilo, et al. “All-Neural Online Source Separation, Counting, and Diarization for Meeting Analysis.” <i>ICASSP 2019, Brighton, UK</i>, 2019.","short":"T. von Neumann, K. Kinoshita, M. Delcroix, S. Araki, T. Nakatani, R. Haeb-Umbach, in: ICASSP 2019, Brighton, UK, 2019.","ieee":"T. von Neumann, K. Kinoshita, M. Delcroix, S. Araki, T. Nakatani, and R. Haeb-Umbach, “All-neural Online Source Separation, Counting, and Diarization for Meeting Analysis,” in <i>ICASSP 2019, Brighton, UK</i>, 2019.","chicago":"Neumann, Thilo von, Keisuke Kinoshita, Marc Delcroix, Shoko Araki, Tomohiro Nakatani, and Reinhold Haeb-Umbach. “All-Neural Online Source Separation, Counting, and Diarization for Meeting Analysis.” In <i>ICASSP 2019, Brighton, UK</i>, 2019.","ama":"von Neumann T, Kinoshita K, Delcroix M, Araki S, Nakatani T, Haeb-Umbach R. All-neural Online Source Separation, Counting, and Diarization for Meeting Analysis. In: <i>ICASSP 2019, Brighton, UK</i>. 
; 2019."},"year":"2019","date_created":"2019-09-18T08:20:50Z","author":[{"last_name":"von Neumann","full_name":"von Neumann, Thilo","first_name":"Thilo"},{"full_name":"Kinoshita, Keisuke","last_name":"Kinoshita","first_name":"Keisuke"},{"last_name":"Delcroix","full_name":"Delcroix, Marc","first_name":"Marc"},{"full_name":"Araki, Shoko","last_name":"Araki","first_name":"Shoko"},{"full_name":"Nakatani, Tomohiro","last_name":"Nakatani","first_name":"Tomohiro"},{"first_name":"Reinhold","full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach"}],"oa":"1","date_updated":"2022-01-06T06:51:31Z","title":"All-neural Online Source Separation, Counting, and Diarization for Meeting Analysis"},{"file":[{"date_created":"2020-02-06T07:28:26Z","creator":"huesera","date_updated":"2020-02-06T07:28:26Z","access_level":"open_access","file_name":"JournalIEEESignal ProcessingMagazine_2019_Haeb-Umbach_Paper.pdf","file_id":"15815","file_size":1085002,"content_type":"application/pdf","relation":"main_file"}],"abstract":[{"lang":"eng","text":"Once a popular theme of futuristic science fiction or far-fetched technology forecasts, digital home assistants with a spoken language interface have become a ubiquitous commodity today. This success has been made possible by major advancements in signal processing and machine learning for so-called far-field speech recognition, where the commands are spoken at a distance from the sound capturing device. The challenges encountered are quite unique and different from many other use cases of automatic speech recognition. The purpose of this tutorial article is to describe, in a way amenable to the non-specialist, the key speech processing algorithms that enable reliable fully hands-free speech interaction with digital home assistants. 
These technologies include multi-channel acoustic echo cancellation, microphone array processing and dereverberation techniques for signal enhancement, reliable wake-up word and end-of-interaction detection, high-quality speech synthesis, as well as sophisticated statistical models for speech and language, learned from large amounts of heterogeneous training data. In all these fields, deep learning has occupied a critical role."}],"publication":"IEEE Signal Processing Magazine","language":[{"iso":"eng"}],"ddc":["000"],"year":"2019","issue":"6","title":"Speech Processing for Digital Home Assistance: Combining Signal Processing With Deep-Learning Techniques","date_created":"2020-02-06T07:26:20Z","status":"public","type":"journal_article","file_date_updated":"2020-02-06T07:28:26Z","department":[{"_id":"54"}],"user_id":"242","_id":"15814","page":"111-124","intvolume":"        36","citation":{"ama":"Haeb-Umbach R, Watanabe S, Nakatani T, et al. Speech Processing for Digital Home Assistance: Combining Signal Processing With Deep-Learning Techniques. <i>IEEE Signal Processing Magazine</i>. 2019;36(6):111-124. doi:<a href=\"https://doi.org/10.1109/MSP.2019.2918706\">10.1109/MSP.2019.2918706</a>","chicago":"Haeb-Umbach, Reinhold, Shinji Watanabe, Tomohiro Nakatani, Michiel Bacchiani, Bjoern Hoffmeister, Michael L. Seltzer, Heiga Zen, and Mehrez Souden. “Speech Processing for Digital Home Assistance: Combining Signal Processing With Deep-Learning Techniques.” <i>IEEE Signal Processing Magazine</i> 36, no. 6 (2019): 111–24. <a href=\"https://doi.org/10.1109/MSP.2019.2918706\">https://doi.org/10.1109/MSP.2019.2918706</a>.","ieee":"R. Haeb-Umbach <i>et al.</i>, “Speech Processing for Digital Home Assistance: Combining Signal Processing With Deep-Learning Techniques,” <i>IEEE Signal Processing Magazine</i>, vol. 36, no. 6, pp. 
111–124, 2019, doi: <a href=\"https://doi.org/10.1109/MSP.2019.2918706\">10.1109/MSP.2019.2918706</a>.","apa":"Haeb-Umbach, R., Watanabe, S., Nakatani, T., Bacchiani, M., Hoffmeister, B., Seltzer, M. L., Zen, H., &#38; Souden, M. (2019). Speech Processing for Digital Home Assistance: Combining Signal Processing With Deep-Learning Techniques. <i>IEEE Signal Processing Magazine</i>, <i>36</i>(6), 111–124. <a href=\"https://doi.org/10.1109/MSP.2019.2918706\">https://doi.org/10.1109/MSP.2019.2918706</a>","short":"R. Haeb-Umbach, S. Watanabe, T. Nakatani, M. Bacchiani, B. Hoffmeister, M.L. Seltzer, H. Zen, M. Souden, IEEE Signal Processing Magazine 36 (2019) 111–124.","mla":"Haeb-Umbach, Reinhold, et al. “Speech Processing for Digital Home Assistance: Combining Signal Processing With Deep-Learning Techniques.” <i>IEEE Signal Processing Magazine</i>, vol. 36, no. 6, 2019, pp. 111–24, doi:<a href=\"https://doi.org/10.1109/MSP.2019.2918706\">10.1109/MSP.2019.2918706</a>.","bibtex":"@article{Haeb-Umbach_Watanabe_Nakatani_Bacchiani_Hoffmeister_Seltzer_Zen_Souden_2019, title={Speech Processing for Digital Home Assistance: Combining Signal Processing With Deep-Learning Techniques}, volume={36}, DOI={<a href=\"https://doi.org/10.1109/MSP.2019.2918706\">10.1109/MSP.2019.2918706</a>}, number={6}, journal={IEEE Signal Processing Magazine}, author={Haeb-Umbach, Reinhold and Watanabe, Shinji and Nakatani, Tomohiro and Bacchiani, Michiel and Hoffmeister, Bjoern and Seltzer, Michael L. 
and Zen, Heiga and Souden, Mehrez}, year={2019}, pages={111–124} }"},"publication_identifier":{"issn":["1558-0792"]},"has_accepted_license":"1","doi":"10.1109/MSP.2019.2918706","volume":36,"author":[{"last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242","first_name":"Reinhold"},{"first_name":"Shinji","last_name":"Watanabe","full_name":"Watanabe, Shinji"},{"first_name":"Tomohiro","full_name":"Nakatani, Tomohiro","last_name":"Nakatani"},{"last_name":"Bacchiani","full_name":"Bacchiani, Michiel","first_name":"Michiel"},{"first_name":"Bjoern","full_name":"Hoffmeister, Bjoern","last_name":"Hoffmeister"},{"first_name":"Michael L.","last_name":"Seltzer","full_name":"Seltzer, Michael L."},{"last_name":"Zen","full_name":"Zen, Heiga","first_name":"Heiga"},{"first_name":"Mehrez","full_name":"Souden, Mehrez","last_name":"Souden"}],"oa":"1","date_updated":"2023-01-09T11:47:09Z"},{"file_date_updated":"2020-09-16T08:10:25Z","language":[{"iso":"eng"}],"ddc":["000"],"department":[{"_id":"54"}],"user_id":"59789","_id":"19450","status":"public","file":[{"content_type":"application/pdf","relation":"main_file","date_updated":"2020-09-16T08:10:25Z","creator":"huesera","date_created":"2020-09-16T08:10:25Z","file_size":337622,"access_level":"open_access","file_name":"Artikel_2019_haeb_umbach.pdf","file_id":"19451"}],"abstract":[{"text":"Wenn akustische Signalverarbeitung mit automatisiertem Lernen verknüpft wird: Nachrichtentechniker arbeiten mit mehreren Mikrofonen und tiefen neuronalen Netzen an besserer Spracherkennung unter widrigsten Bedingungen. 
Von solchen Sensornetzwerken könnten langfristig auch digitale Sprachassistenten profitieren.","lang":"eng"}],"publication":"DFG forschung 1/2019","type":"journal_article","doi":"10.1002/fors.201970104","title":"Lektionen für Alexa & Co?!","author":[{"full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach","first_name":"Reinhold"}],"date_created":"2020-09-16T08:09:15Z","date_updated":"2023-01-11T11:24:57Z","oa":"1","page":"12-15","citation":{"mla":"Haeb-Umbach, Reinhold. “Lektionen Für Alexa &#38; Co?!” <i>DFG Forschung 1/2019</i>, 2019, pp. 12–15, doi:<a href=\"https://doi.org/10.1002/fors.201970104\">10.1002/fors.201970104</a>.","bibtex":"@article{Haeb-Umbach_2019, title={Lektionen für Alexa &#38; Co?!}, DOI={<a href=\"https://doi.org/10.1002/fors.201970104\">10.1002/fors.201970104</a>}, journal={DFG forschung 1/2019}, author={Haeb-Umbach, Reinhold}, year={2019}, pages={12–15} }","short":"R. Haeb-Umbach, DFG Forschung 1/2019 (2019) 12–15.","apa":"Haeb-Umbach, R. (2019). Lektionen für Alexa &#38; Co?! <i>DFG Forschung 1/2019</i>, 12–15. <a href=\"https://doi.org/10.1002/fors.201970104\">https://doi.org/10.1002/fors.201970104</a>","ama":"Haeb-Umbach R. Lektionen für Alexa &#38; Co?! <i>DFG forschung 1/2019</i>. Published online 2019:12-15. doi:<a href=\"https://doi.org/10.1002/fors.201970104\">10.1002/fors.201970104</a>","ieee":"R. Haeb-Umbach, “Lektionen für Alexa &#38; Co?!,” <i>DFG forschung 1/2019</i>, pp. 12–15, 2019, doi: <a href=\"https://doi.org/10.1002/fors.201970104\">10.1002/fors.201970104</a>.","chicago":"Haeb-Umbach, Reinhold. “Lektionen Für Alexa &#38; Co?!” <i>DFG Forschung 1/2019</i>, 2019, 12–15. <a href=\"https://doi.org/10.1002/fors.201970104\">https://doi.org/10.1002/fors.201970104</a>."},"year":"2019","has_accepted_license":"1"},{"related_material":{"link":[{"description":"Listening examples","relation":"supplementary_material","url":"http://go.upb.de/vcex"}]},"quality_controlled":"1","page":"81-86","citation":{"short":"T. 
Gburrek, T. Glarner, J. Ebbers, R. Haeb-Umbach, P. Wagner, in: Proc. 10th ISCA Speech Synthesis Workshop, 2019, pp. 81–86.","bibtex":"@inproceedings{Gburrek_Glarner_Ebbers_Haeb-Umbach_Wagner_2019, title={Unsupervised Learning of a Disentangled Speech Representation for Voice Conversion}, DOI={<a href=\"https://doi.org/10.21437/SSW.2019-15\">10.21437/SSW.2019-15</a>}, booktitle={Proc. 10th ISCA Speech Synthesis Workshop}, author={Gburrek, Tobias and Glarner, Thomas and Ebbers, Janek and Haeb-Umbach, Reinhold and Wagner, Petra}, year={2019}, pages={81–86} }","mla":"Gburrek, Tobias, et al. “Unsupervised Learning of a Disentangled Speech Representation for Voice Conversion.” <i>Proc. 10th ISCA Speech Synthesis Workshop</i>, 2019, pp. 81–86, doi:<a href=\"https://doi.org/10.21437/SSW.2019-15\">10.21437/SSW.2019-15</a>.","apa":"Gburrek, T., Glarner, T., Ebbers, J., Haeb-Umbach, R., &#38; Wagner, P. (2019). Unsupervised Learning of a Disentangled Speech Representation for Voice Conversion. <i>Proc. 10th ISCA Speech Synthesis Workshop</i>, 81–86. <a href=\"https://doi.org/10.21437/SSW.2019-15\">https://doi.org/10.21437/SSW.2019-15</a>","ama":"Gburrek T, Glarner T, Ebbers J, Haeb-Umbach R, Wagner P. Unsupervised Learning of a Disentangled Speech Representation for Voice Conversion. In: <i>Proc. 10th ISCA Speech Synthesis Workshop</i>. ; 2019:81-86. doi:<a href=\"https://doi.org/10.21437/SSW.2019-15\">10.21437/SSW.2019-15</a>","chicago":"Gburrek, Tobias, Thomas Glarner, Janek Ebbers, Reinhold Haeb-Umbach, and Petra Wagner. “Unsupervised Learning of a Disentangled Speech Representation for Voice Conversion.” In <i>Proc. 10th ISCA Speech Synthesis Workshop</i>, 81–86, 2019. <a href=\"https://doi.org/10.21437/SSW.2019-15\">https://doi.org/10.21437/SSW.2019-15</a>.","ieee":"T. Gburrek, T. Glarner, J. Ebbers, R. Haeb-Umbach, and P. Wagner, “Unsupervised Learning of a Disentangled Speech Representation for Voice Conversion,” in <i>Proc. 
10th ISCA Speech Synthesis Workshop</i>, Vienna, 2019, pp. 81–86, doi: <a href=\"https://doi.org/10.21437/SSW.2019-15\">10.21437/SSW.2019-15</a>."},"year":"2019","date_created":"2019-12-04T08:12:29Z","author":[{"full_name":"Gburrek, Tobias","id":"44006","last_name":"Gburrek","first_name":"Tobias"},{"full_name":"Glarner, Thomas","id":"14169","last_name":"Glarner","first_name":"Thomas"},{"id":"34851","full_name":"Ebbers, Janek","last_name":"Ebbers","first_name":"Janek"},{"first_name":"Reinhold","full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach"},{"first_name":"Petra","last_name":"Wagner","full_name":"Wagner, Petra"}],"date_updated":"2023-11-17T06:20:39Z","oa":"1","conference":{"location":"Vienna","name":"10th ISCA Speech Synthesis Workshop"},"doi":"10.21437/SSW.2019-15","main_file_link":[{"url":"https://www.isca-speech.org/archive/pdfs/ssw_2019/gburrek19_ssw.pdf","open_access":"1"}],"title":"Unsupervised Learning of a Disentangled Speech Representation for Voice Conversion","publication":"Proc. 10th ISCA Speech Synthesis Workshop","type":"conference","status":"public","abstract":[{"text":"This paper presents an approach to voice conversion, which does neither require parallel data nor speaker or phone labels for training. It can convert between speakers which are not in the training set by employing the previously proposed concept of a factorized hierarchical variational autoencoder. Here, linguistic and speaker induced variations are separated upon the notion that content induced variations change at a much shorter timescale, i.e., at the segment level, than speaker induced variations, which vary at the longer utterance level. 
In this contribution we propose to employ convolutional instead of recurrent network layers in the encoder and decoder blocks, which is shown to achieve better phone recognition accuracy on the latent segment variables at frame-level due to their better temporal resolution. For voice conversion the mean of the utterance variables is replaced with the respective estimated mean of the target speaker. The resulting log-mel spectra of the decoder output are used as local conditions of a WaveNet which is utilized for synthesis of the speech waveforms. Experiments show both good disentanglement properties of the latent space variables, and good voice conversion performance.","lang":"eng"}],"department":[{"_id":"54"}],"user_id":"44006","_id":"15237","language":[{"iso":"eng"}]},{"abstract":[{"text":"In this paper we present our audio tagging system for the DCASE 2019 Challenge Task 2. We propose a model consisting of a convolutional front end using log-mel-energies as input features, a recurrent neural network sequence encoder and a fully connected classifier network outputting an activity probability for each of the 80 considered event classes. Due to the recurrent neural network, which encodes a whole sequence into a single vector, our model is able to process sequences of varying lengths. The model is trained with only little manually labeled training data and a larger amount of automatically labeled web data, which hence suffers from label noise. To efficiently train the model with the provided data we use various data augmentation to prevent overfitting and improve generalization. Our best submitted system achieves a label-weighted label-ranking average precision (lwlrap) of 75.5% on the private test set which is an absolute improvement of 21.7% over the baseline. This system scored the second place in the teams ranking of the DCASE 2019 Challenge Task 2 and the fifth place in the Kaggle competition “Freesound Audio Tagging 2019” with more than 400 participants. 
After the challenge ended we further improved performance to 76.5% lwlrap setting a new state-of-the-art on this dataset.","lang":"eng"}],"file":[{"content_type":"application/pdf","relation":"main_file","date_created":"2020-02-05T10:18:06Z","creator":"huesera","date_updated":"2020-02-05T10:18:06Z","file_name":"DCASE_2019_WS_Ebbers_Paper.pdf","file_id":"15795","access_level":"open_access","file_size":184967}],"status":"public","type":"conference","publication":"DCASE2019 Workshop, New York, USA","ddc":["000"],"language":[{"iso":"eng"}],"file_date_updated":"2020-02-05T10:18:06Z","project":[{"name":"Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"_id":"15794","user_id":"34851","department":[{"_id":"54"}],"year":"2019","citation":{"ieee":"J. Ebbers and R. Haeb-Umbach, “Convolutional Recurrent Neural Network and Data Augmentation for Audio Tagging with Noisy Labels and Minimal Supervision,” 2019.","chicago":"Ebbers, Janek, and Reinhold Haeb-Umbach. “Convolutional Recurrent Neural Network and Data Augmentation for Audio Tagging with Noisy Labels and Minimal Supervision.” In <i>DCASE2019 Workshop, New York, USA</i>, 2019.","ama":"Ebbers J, Haeb-Umbach R. Convolutional Recurrent Neural Network and Data Augmentation for Audio Tagging with Noisy Labels and Minimal Supervision. In: <i>DCASE2019 Workshop, New York, USA</i>. ; 2019.","short":"J. Ebbers, R. Haeb-Umbach, in: DCASE2019 Workshop, New York, USA, 2019.","bibtex":"@inproceedings{Ebbers_Haeb-Umbach_2019, title={Convolutional Recurrent Neural Network and Data Augmentation for Audio Tagging with Noisy Labels and Minimal Supervision}, booktitle={DCASE2019 Workshop, New York, USA}, author={Ebbers, Janek and Haeb-Umbach, Reinhold}, year={2019} }","mla":"Ebbers, Janek, and Reinhold Haeb-Umbach. 
“Convolutional Recurrent Neural Network and Data Augmentation for Audio Tagging with Noisy Labels and Minimal Supervision.” <i>DCASE2019 Workshop, New York, USA</i>, 2019.","apa":"Ebbers, J., &#38; Haeb-Umbach, R. (2019). Convolutional Recurrent Neural Network and Data Augmentation for Audio Tagging with Noisy Labels and Minimal Supervision. <i>DCASE2019 Workshop, New York, USA</i>."},"has_accepted_license":"1","quality_controlled":"1","title":"Convolutional Recurrent Neural Network and Data Augmentation for Audio Tagging with Noisy Labels and Minimal Supervision","oa":"1","date_updated":"2023-11-22T08:30:12Z","author":[{"first_name":"Janek","last_name":"Ebbers","id":"34851","full_name":"Ebbers, Janek"},{"first_name":"Reinhold","id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach"}],"date_created":"2020-02-05T10:16:03Z"},{"quality_controlled":"1","has_accepted_license":"1","year":"2019","citation":{"ieee":"J. Ebbers, L. Drude, R. Haeb-Umbach, A. Brendel, and W. Kellermann, “Weakly Supervised Sound Activity Detection and Event Classification in Acoustic Sensor Networks,” 2019.","chicago":"Ebbers, Janek, Lukas Drude, Reinhold Haeb-Umbach, Andreas Brendel, and Walter Kellermann. “Weakly Supervised Sound Activity Detection and Event Classification in Acoustic Sensor Networks.” In <i>CAMSAP 2019, Guadeloupe, West Indies</i>, 2019.","ama":"Ebbers J, Drude L, Haeb-Umbach R, Brendel A, Kellermann W. Weakly Supervised Sound Activity Detection and Event Classification in Acoustic Sensor Networks. In: <i>CAMSAP 2019, Guadeloupe, West Indies</i>. ; 2019.","apa":"Ebbers, J., Drude, L., Haeb-Umbach, R., Brendel, A., &#38; Kellermann, W. (2019). Weakly Supervised Sound Activity Detection and Event Classification in Acoustic Sensor Networks. <i>CAMSAP 2019, Guadeloupe, West Indies</i>.","mla":"Ebbers, Janek, et al. 
“Weakly Supervised Sound Activity Detection and Event Classification in Acoustic Sensor Networks.” <i>CAMSAP 2019, Guadeloupe, West Indies</i>, 2019.","short":"J. Ebbers, L. Drude, R. Haeb-Umbach, A. Brendel, W. Kellermann, in: CAMSAP 2019, Guadeloupe, West Indies, 2019.","bibtex":"@inproceedings{Ebbers_Drude_Haeb-Umbach_Brendel_Kellermann_2019, title={Weakly Supervised Sound Activity Detection and Event Classification in Acoustic Sensor Networks}, booktitle={CAMSAP 2019, Guadeloupe, West Indies}, author={Ebbers, Janek and Drude, Lukas and Haeb-Umbach, Reinhold and Brendel, Andreas and Kellermann, Walter}, year={2019} }"},"date_updated":"2023-11-22T08:29:58Z","oa":"1","date_created":"2020-02-05T10:20:17Z","author":[{"last_name":"Ebbers","full_name":"Ebbers, Janek","id":"34851","first_name":"Janek"},{"first_name":"Lukas","full_name":"Drude, Lukas","id":"11213","last_name":"Drude"},{"first_name":"Reinhold","full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach"},{"last_name":"Brendel","full_name":"Brendel, Andreas","first_name":"Andreas"},{"first_name":"Walter","last_name":"Kellermann","full_name":"Kellermann, Walter"}],"title":"Weakly Supervised Sound Activity Detection and Event Classification in Acoustic Sensor Networks","publication":"CAMSAP 2019, Guadeloupe, West Indies","type":"conference","abstract":[{"lang":"eng","text":"In this paper we consider human daily activity recognition using an acoustic sensor network (ASN) which consists of nodes distributed in a home environment. Assuming that the ASN is permanently recording, the vast majority of recordings is silence. Therefore, we propose to employ a computationally efficient two-stage sound recognition system, consisting of an initial sound activity detection (SAD) and a subsequent sound event classification (SEC), which is only activated once sound activity has been detected. 
We show how a low-latency activity detector with high temporal resolution can be trained from weak labels with low temporal resolution. We further demonstrate the advantage of using spatial features for the subsequent event classification task."}],"status":"public","file":[{"file_name":"CAMSAP_2019_WS_Ebbers_Paper.pdf","access_level":"open_access","file_id":"15797","file_size":311887,"creator":"huesera","date_created":"2020-02-05T10:21:39Z","date_updated":"2020-02-05T10:21:39Z","relation":"main_file","content_type":"application/pdf"}],"_id":"15796","project":[{"name":"Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"department":[{"_id":"54"}],"user_id":"34851","ddc":["000"],"file_date_updated":"2020-02-05T10:21:39Z","language":[{"iso":"eng"}]},{"abstract":[{"lang":"eng","text":"In this paper we highlight the privacy risks entailed in deep neural network feature extraction for domestic activity monitoring. We employ the baseline system proposed in the Task 5 of the DCASE 2018 challenge and simulate a feature interception attack by an eavesdropper who wants to perform speaker identification. We then propose to reduce the aforementioned privacy risks by introducing a variational information feature extraction scheme that allows for good activity monitoring performance while at the same time minimizing the information of the feature representation, thus restricting speaker identification attempts. We analyze the resulting model’s composite loss function and the budget scaling factor used to control the balance between the performance of the trusted and attacker tasks. 
It is empirically demonstrated that the proposed method reduces speaker identification privacy risks without significantly deprecating the performance of domestic activity monitoring tasks."}],"file":[{"content_type":"application/pdf","relation":"main_file","creator":"huesera","date_created":"2020-02-05T10:11:40Z","date_updated":"2020-02-05T10:11:40Z","access_level":"open_access","file_name":"INTERSPEECH_2019_Ebbers_Paper.pdf","file_id":"15793","file_size":454600}],"status":"public","type":"conference","publication":"INTERSPEECH 2019, Graz, Austria","ddc":["000"],"language":[{"iso":"eng"}],"file_date_updated":"2020-02-05T10:11:40Z","_id":"15792","user_id":"34851","department":[{"_id":"54"}],"year":"2019","citation":{"bibtex":"@inproceedings{Nelus_Ebbers_Haeb-Umbach_Martin_2019, title={Privacy-preserving Variational Information Feature Extraction for Domestic Activity Monitoring Versus Speaker Identification}, booktitle={INTERSPEECH 2019, Graz, Austria}, author={Nelus, Alexandru and Ebbers, Janek and Haeb-Umbach, Reinhold and Martin, Rainer}, year={2019} }","mla":"Nelus, Alexandru, et al. “Privacy-Preserving Variational Information Feature Extraction for Domestic Activity Monitoring Versus Speaker Identification.” <i>INTERSPEECH 2019, Graz, Austria</i>, 2019.","short":"A. Nelus, J. Ebbers, R. Haeb-Umbach, R. Martin, in: INTERSPEECH 2019, Graz, Austria, 2019.","apa":"Nelus, A., Ebbers, J., Haeb-Umbach, R., &#38; Martin, R. (2019). Privacy-preserving Variational Information Feature Extraction for Domestic Activity Monitoring Versus Speaker Identification. <i>INTERSPEECH 2019, Graz, Austria</i>.","ama":"Nelus A, Ebbers J, Haeb-Umbach R, Martin R. Privacy-preserving Variational Information Feature Extraction for Domestic Activity Monitoring Versus Speaker Identification. In: <i>INTERSPEECH 2019, Graz, Austria</i>. ; 2019.","chicago":"Nelus, Alexandru, Janek Ebbers, Reinhold Haeb-Umbach, and Rainer Martin. 
“Privacy-Preserving Variational Information Feature Extraction for Domestic Activity Monitoring Versus Speaker Identification.” In <i>INTERSPEECH 2019, Graz, Austria</i>, 2019.","ieee":"A. Nelus, J. Ebbers, R. Haeb-Umbach, and R. Martin, “Privacy-preserving Variational Information Feature Extraction for Domestic Activity Monitoring Versus Speaker Identification,” 2019."},"has_accepted_license":"1","quality_controlled":"1","title":"Privacy-preserving Variational Information Feature Extraction for Domestic Activity Monitoring Versus Speaker Identification","date_updated":"2023-11-22T08:27:55Z","oa":"1","author":[{"last_name":"Nelus","full_name":"Nelus, Alexandru","first_name":"Alexandru"},{"full_name":"Ebbers, Janek","id":"34851","last_name":"Ebbers","first_name":"Janek"},{"first_name":"Reinhold","full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach"},{"first_name":"Rainer","last_name":"Martin","full_name":"Martin, Rainer"}],"date_created":"2020-02-05T10:07:53Z"},{"language":[{"iso":"eng"}],"user_id":"44006","department":[{"_id":"54"}],"_id":"11760","status":"public","abstract":[{"lang":"eng","text":"Acoustic event detection, i.e., the task of assigning a human interpretable label to a segment of audio, has only recently attracted increased interest in the research community. Driven by the DCASE challenges and the availability of large-scale audio datasets, the state-of-the-art has progressed rapidly with deep-learning-based classifiers dominating the field. Because several potential use cases favor a realization on distributed sensor nodes, e.g. ambient assisted living applications, habitat monitoring or surveillance, we are concerned with two issues here. Firstly the classification performance of such systems and secondly the computing resources required to achieve a certain performance considering node level feature extraction. 
In this contribution we look at the balance between the two criteria by employing traditional techniques and different deep learning architectures, including convolutional and recurrent models in the context of real life everyday audio recordings in realistic, however challenging, multisource conditions."}],"type":"conference","publication":"DAGA 2018, München","main_file_link":[{"open_access":"1","url":"https://groups.uni-paderborn.de/nt/pubs/2018/Daga_2018_Ebbers_Paper.pdf"}],"title":"Evaluation of Modulation-MFCC Features and DNN Classification for Acoustic Event Detection","author":[{"first_name":"Janek","id":"34851","full_name":"Ebbers, Janek","last_name":"Ebbers"},{"full_name":"Nelus, Alexandru","last_name":"Nelus","first_name":"Alexandru"},{"first_name":"Rainer","full_name":"Martin, Rainer","last_name":"Martin"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold"}],"date_created":"2019-07-12T05:27:43Z","date_updated":"2022-01-06T06:51:08Z","oa":"1","citation":{"apa":"Ebbers, J., Nelus, A., Martin, R., &#38; Haeb-Umbach, R. (2018). Evaluation of Modulation-MFCC Features and DNN Classification for Acoustic Event Detection. In <i>DAGA 2018, München</i>.","short":"J. Ebbers, A. Nelus, R. Martin, R. Haeb-Umbach, in: DAGA 2018, München, 2018.","bibtex":"@inproceedings{Ebbers_Nelus_Martin_Haeb-Umbach_2018, title={Evaluation of Modulation-MFCC Features and DNN Classification for Acoustic Event Detection}, booktitle={DAGA 2018, München}, author={Ebbers, Janek and Nelus, Alexandru and Martin, Rainer and Haeb-Umbach, Reinhold}, year={2018} }","mla":"Ebbers, Janek, et al. “Evaluation of Modulation-MFCC Features and DNN Classification for Acoustic Event Detection.” <i>DAGA 2018, München</i>, 2018.","chicago":"Ebbers, Janek, Alexandru Nelus, Rainer Martin, and Reinhold Haeb-Umbach. “Evaluation of Modulation-MFCC Features and DNN Classification for Acoustic Event Detection.” In <i>DAGA 2018, München</i>, 2018.","ieee":"J. 
Ebbers, A. Nelus, R. Martin, and R. Haeb-Umbach, “Evaluation of Modulation-MFCC Features and DNN Classification for Acoustic Event Detection,” in <i>DAGA 2018, München</i>, 2018.","ama":"Ebbers J, Nelus A, Martin R, Haeb-Umbach R. Evaluation of Modulation-MFCC Features and DNN Classification for Acoustic Event Detection. In: <i>DAGA 2018, München</i>. ; 2018."},"year":"2018"},{"oa":"1","date_updated":"2022-01-06T06:51:11Z","date_created":"2019-07-12T05:29:10Z","author":[{"last_name":"Heymann","full_name":"Heymann, Jahn","id":"9168","first_name":"Jahn"},{"id":"11213","full_name":"Drude, Lukas","last_name":"Drude","first_name":"Lukas"},{"last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242","first_name":"Reinhold"},{"full_name":"Kinoshita, Keisuke","last_name":"Kinoshita","first_name":"Keisuke"},{"first_name":"Tomohiro","full_name":"Nakatani, Tomohiro","last_name":"Nakatani"}],"title":"Frame-Online DNN-WPE Dereverberation","main_file_link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2018/IWAENC_2018_Heymann_Paper.pdf","open_access":"1"}],"related_material":{"link":[{"description":"Poster","relation":"supplementary_material","url":"https://groups.uni-paderborn.de/nt/pubs/2018/IWAENC_2018_Heymann_Poster.pdf"}]},"year":"2018","citation":{"ieee":"J. Heymann, L. Drude, R. Haeb-Umbach, K. Kinoshita, and T. Nakatani, “Frame-Online DNN-WPE Dereverberation,” in <i>IWAENC 2018, Tokio, Japan</i>, 2018.","chicago":"Heymann, Jahn, Lukas Drude, Reinhold Haeb-Umbach, Keisuke Kinoshita, and Tomohiro Nakatani. “Frame-Online DNN-WPE Dereverberation.” In <i>IWAENC 2018, Tokio, Japan</i>, 2018.","ama":"Heymann J, Drude L, Haeb-Umbach R, Kinoshita K, Nakatani T. Frame-Online DNN-WPE Dereverberation. In: <i>IWAENC 2018, Tokio, Japan</i>. ; 2018.","apa":"Heymann, J., Drude, L., Haeb-Umbach, R., Kinoshita, K., &#38; Nakatani, T. (2018). Frame-Online DNN-WPE Dereverberation. In <i>IWAENC 2018, Tokio, Japan</i>.","mla":"Heymann, Jahn, et al. 
“Frame-Online DNN-WPE Dereverberation.” <i>IWAENC 2018, Tokio, Japan</i>, 2018.","short":"J. Heymann, L. Drude, R. Haeb-Umbach, K. Kinoshita, T. Nakatani, in: IWAENC 2018, Tokio, Japan, 2018.","bibtex":"@inproceedings{Heymann_Drude_Haeb-Umbach_Kinoshita_Nakatani_2018, title={Frame-Online DNN-WPE Dereverberation}, booktitle={IWAENC 2018, Tokio, Japan}, author={Heymann, Jahn and Drude, Lukas and Haeb-Umbach, Reinhold and Kinoshita, Keisuke and Nakatani, Tomohiro}, year={2018} }"},"_id":"11835","department":[{"_id":"54"}],"user_id":"44006","language":[{"iso":"eng"}],"publication":"IWAENC 2018, Tokio, Japan","type":"conference","abstract":[{"text":"Signal dereverberation using the weighted prediction error (WPE) method has been proven to be an effective means to raise the accuracy of far-field speech recognition. But in its original formulation, WPE requires multiple iterations over a sufficiently long utterance, rendering it unsuitable for online low-latency applications. Recently, two methods have been proposed to overcome this limitation. One utilizes a neural network to estimate the power spectral density (PSD) of the target signal and works in a block-online fashion. The other method relies on a rather simple PSD estimation which smoothes the observed PSD and utilizes a recursive formulation which enables it to work on a frame-by-frame basis. In this paper, we integrate a deep neural network (DNN) based estimator into the recursive frame-online formulation. We evaluate the performance of the recursive system with different PSD estimators in comparison to the block-online and offline variant on two distinct corpora. The REVERB challenge data, where the signal is mainly deteriorated by reverberation, and a database which combines WSJ and VoiceHome to also consider (directed) noise sources. 
The results show that although smoothing works surprisingly well, the more sophisticated DNN based estimator shows promising improvements and shortens the performance gap between online and offline processing.","lang":"eng"}],"status":"public"},{"related_material":{"link":[{"relation":"supplementary_material","description":"Slides","url":"https://groups.uni-paderborn.de/nt/pubs/2018/ITG_2018_Heitkaemper_Slides.pdf"}]},"citation":{"ama":"Heitkaemper J, Heymann J, Haeb-Umbach R. Smoothing along Frequency in Online Neural Network Supported Acoustic Beamforming. In: <i>ITG 2018, Oldenburg, Germany</i>. ; 2018.","chicago":"Heitkaemper, Jens, Jahn Heymann, and Reinhold Haeb-Umbach. “Smoothing along Frequency in Online Neural Network Supported Acoustic Beamforming.” In <i>ITG 2018, Oldenburg, Germany</i>, 2018.","ieee":"J. Heitkaemper, J. Heymann, and R. Haeb-Umbach, “Smoothing along Frequency in Online Neural Network Supported Acoustic Beamforming,” in <i>ITG 2018, Oldenburg, Germany</i>, 2018.","apa":"Heitkaemper, J., Heymann, J., &#38; Haeb-Umbach, R. (2018). Smoothing along Frequency in Online Neural Network Supported Acoustic Beamforming. In <i>ITG 2018, Oldenburg, Germany</i>.","short":"J. Heitkaemper, J. Heymann, R. Haeb-Umbach, in: ITG 2018, Oldenburg, Germany, 2018.","bibtex":"@inproceedings{Heitkaemper_Heymann_Haeb-Umbach_2018, title={Smoothing along Frequency in Online Neural Network Supported Acoustic Beamforming}, booktitle={ITG 2018, Oldenburg, Germany}, author={Heitkaemper, Jens and Heymann, Jahn and Haeb-Umbach, Reinhold}, year={2018} }","mla":"Heitkaemper, Jens, et al. 
“Smoothing along Frequency in Online Neural Network Supported Acoustic Beamforming.” <i>ITG 2018, Oldenburg, Germany</i>, 2018."},"year":"2018","author":[{"id":"27643","full_name":"Heitkaemper, Jens","last_name":"Heitkaemper","first_name":"Jens"},{"last_name":"Heymann","id":"9168","full_name":"Heymann, Jahn","first_name":"Jahn"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242"}],"date_created":"2019-07-12T05:29:13Z","date_updated":"2022-01-06T06:51:11Z","oa":"1","main_file_link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2018/ITG_2018_Heitkaemper_Paper.pdf","open_access":"1"}],"title":"Smoothing along Frequency in Online Neural Network Supported Acoustic Beamforming","publication":"ITG 2018, Oldenburg, Germany","type":"conference","status":"public","abstract":[{"text":"We present a block-online multi-channel front end for automatic speech recognition in noisy and reverberated environments. It is an online version of our earlier proposed neural network supported acoustic beamformer, whose coefficients are calculated from noise and speech spatial covariance matrices which are estimated utilizing a neural mask estimator. However, the sparsity of speech in the STFT domain causes problems for the initial beamformer coefficients estimation in some frequency bins due to lack of speech observations. We propose two methods to mitigate this issue. The first is to lower the frequency resolution of the STFT, which comes with the additional advantage of a reduced time window, thus lowering the latency introduced by block processing. The second approach is to smooth beamforming coefficients along the frequency axis, thus exploiting their high interfrequency correlation. With both approaches the gap between offline and block-online beamformer performance, as measured by the word error rate achieved by a downstream speech recognizer, is significantly reduced. 
Experiments are carried out on two corpora, representing noisy (CHiME-4) and noisy reverberant (voiceHome) environments.","lang":"eng"}],"department":[{"_id":"54"}],"user_id":"44006","_id":"11837","language":[{"iso":"eng"}]},{"citation":{"apa":"Drude, L., Boeddeker, C., Heymann, J., Kinoshita, K., Delcroix, M., Nakatani, T., &#38; Haeb-Umbach, R. (2018). Integration neural network based beamforming and weighted prediction error dereverberation. In <i>INTERSPEECH 2018, Hyderabad, India</i>.","bibtex":"@inproceedings{Drude_Boeddeker_Heymann_Kinoshita_Delcroix_Nakatani_Haeb-Umbach_2018, title={Integration neural network based beamforming and weighted prediction error dereverberation}, booktitle={INTERSPEECH 2018, Hyderabad, India}, author={Drude, Lukas and Boeddeker, Christoph and Heymann, Jahn and Kinoshita, Keisuke and Delcroix, Marc and Nakatani, Tomohiro and Haeb-Umbach, Reinhold}, year={2018} }","short":"L. Drude, C. Boeddeker, J. Heymann, K. Kinoshita, M. Delcroix, T. Nakatani, R. Haeb-Umbach, in: INTERSPEECH 2018, Hyderabad, India, 2018.","mla":"Drude, Lukas, et al. “Integration Neural Network Based Beamforming and Weighted Prediction Error Dereverberation.” <i>INTERSPEECH 2018, Hyderabad, India</i>, 2018.","ama":"Drude L, Boeddeker C, Heymann J, et al. Integration neural network based beamforming and weighted prediction error dereverberation. In: <i>INTERSPEECH 2018, Hyderabad, India</i>. ; 2018.","chicago":"Drude, Lukas, Christoph Boeddeker, Jahn Heymann, Keisuke Kinoshita, Marc Delcroix, Tomohiro Nakatani, and Reinhold Haeb-Umbach. “Integration Neural Network Based Beamforming and Weighted Prediction Error Dereverberation.” In <i>INTERSPEECH 2018, Hyderabad, India</i>, 2018.","ieee":"L. 
Drude <i>et al.</i>, “Integration neural network based beamforming and weighted prediction error dereverberation,” in <i>INTERSPEECH 2018, Hyderabad, India</i>, 2018."},"year":"2018","related_material":{"link":[{"description":"Slides","relation":"supplementary_material","url":"https://groups.uni-paderborn.de/nt/pubs/2018/INTERSPEECH_2018_Drude_Slides.pdf"}]},"main_file_link":[{"open_access":"1","url":"https://groups.uni-paderborn.de/nt/pubs/2018/INTERSPEECH_2018_Drude_Paper.pdf"}],"title":"Integration neural network based beamforming and weighted prediction error dereverberation","date_created":"2019-07-12T05:29:53Z","author":[{"id":"11213","full_name":"Drude, Lukas","last_name":"Drude","first_name":"Lukas"},{"last_name":"Boeddeker","full_name":"Boeddeker, Christoph","id":"40767","first_name":"Christoph"},{"id":"9168","full_name":"Heymann, Jahn","last_name":"Heymann","first_name":"Jahn"},{"first_name":"Keisuke","last_name":"Kinoshita","full_name":"Kinoshita, Keisuke"},{"last_name":"Delcroix","full_name":"Delcroix, Marc","first_name":"Marc"},{"first_name":"Tomohiro","full_name":"Nakatani, Tomohiro","last_name":"Nakatani"},{"last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242","first_name":"Reinhold"}],"date_updated":"2022-01-06T06:51:11Z","oa":"1","status":"public","abstract":[{"text":"The weighted prediction error (WPE) algorithm has proven to be a very successful dereverberation method for the REVERB challenge. Likewise, neural network based mask estimation for beamforming demonstrated very good noise suppression in the CHiME 3 and CHiME 4 challenges. Recently, it has been shown that this estimator can also be trained to perform dereverberation and denoising jointly. However, up to now a comparison of a neural beamformer and WPE is still missing, so is an investigation into a combination of the two. 
Therefore, we here provide an extensive evaluation of both and consequently propose variants to integrate deep neural network based beamforming with WPE. For these integrated variants we identify a consistent word error rate (WER) reduction on two distinct databases. In particular, our study shows that deep learning based beamforming benefits from a model-based dereverberation technique (i.e. WPE) and vice versa. Our key findings are: (a) Neural beamforming yields the lower WERs in comparison to WPE the more channels and noise are present. (b) Integration of WPE and a neural beamformer consistently outperforms all stand-alone systems.","lang":"eng"}],"publication":"INTERSPEECH 2018, Hyderabad, India","type":"conference","language":[{"iso":"eng"}],"department":[{"_id":"54"}],"user_id":"40767","_id":"11872","project":[{"_id":"52","name":"Computing Resources Provided by the Paderborn Center for Parallel Computing"}]},{"year":"2018","citation":{"ama":"Drude L, Heymann J, Boeddeker C, Haeb-Umbach R. NARA-WPE: A Python package for weighted prediction error dereverberation in Numpy and Tensorflow for online and offline processing. In: <i>ITG 2018, Oldenburg, Germany</i>. ; 2018.","ieee":"L. Drude, J. Heymann, C. Boeddeker, and R. Haeb-Umbach, “NARA-WPE: A Python package for weighted prediction error dereverberation in Numpy and Tensorflow for online and offline processing,” in <i>ITG 2018, Oldenburg, Germany</i>, 2018.","chicago":"Drude, Lukas, Jahn Heymann, Christoph Boeddeker, and Reinhold Haeb-Umbach. “NARA-WPE: A Python Package for Weighted Prediction Error Dereverberation in Numpy and Tensorflow for Online and Offline Processing.” In <i>ITG 2018, Oldenburg, Germany</i>, 2018.","mla":"Drude, Lukas, et al. 
“NARA-WPE: A Python Package for Weighted Prediction Error Dereverberation in Numpy and Tensorflow for Online and Offline Processing.” <i>ITG 2018, Oldenburg, Germany</i>, 2018.","bibtex":"@inproceedings{Drude_Heymann_Boeddeker_Haeb-Umbach_2018, title={NARA-WPE: A Python package for weighted prediction error dereverberation in Numpy and Tensorflow for online and offline processing}, booktitle={ITG 2018, Oldenburg, Germany}, author={Drude, Lukas and Heymann, Jahn and Boeddeker, Christoph and Haeb-Umbach, Reinhold}, year={2018} }","short":"L. Drude, J. Heymann, C. Boeddeker, R. Haeb-Umbach, in: ITG 2018, Oldenburg, Germany, 2018.","apa":"Drude, L., Heymann, J., Boeddeker, C., &#38; Haeb-Umbach, R. (2018). NARA-WPE: A Python package for weighted prediction error dereverberation in Numpy and Tensorflow for online and offline processing. In <i>ITG 2018, Oldenburg, Germany</i>."},"related_material":{"link":[{"relation":"supplementary_material","description":"Poster","url":"https://groups.uni-paderborn.de/nt/pubs/2018/ITG_2018_Drude_Poster.pdf"}]},"title":"NARA-WPE: A Python package for weighted prediction error dereverberation in Numpy and Tensorflow for online and offline processing","main_file_link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2018/ITG_2018_Drude_Paper.pdf","open_access":"1"}],"date_updated":"2022-01-06T06:51:11Z","oa":"1","author":[{"first_name":"Lukas","last_name":"Drude","id":"11213","full_name":"Drude, Lukas"},{"last_name":"Heymann","full_name":"Heymann, Jahn","id":"9168","first_name":"Jahn"},{"id":"40767","full_name":"Boeddeker, Christoph","last_name":"Boeddeker","first_name":"Christoph"},{"last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold"}],"date_created":"2019-07-12T05:29:54Z","abstract":[{"text":"NARA-WPE is a Python software package providing implementations of the weighted prediction error (WPE) dereverberation algorithm. 
WPE has been shown to be a highly effective tool for speech dereverberation, thus improving the perceptual quality of the signal and improving the recognition performance of downstream automatic speech recognition (ASR). It is suitable both for single-channel and multi-channel applications. The package consist of (1) a Numpy implementation which can easily be integrated into a custom Python toolchain, and (2) a TensorFlow implementation which allows integration into larger computational graphs and enables backpropagation through WPE to train more advanced front-ends. This package comprises of an iterative offline (batch) version, a block-online version, and a frame-online version which can be used in moderately low latency applications, e.g. digital speech assistants.","lang":"eng"}],"status":"public","publication":"ITG 2018, Oldenburg, Germany","type":"conference","language":[{"iso":"eng"}],"_id":"11873","project":[{"name":"Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"department":[{"_id":"54"}],"user_id":"40767"},{"citation":{"bibtex":"@article{Despotovic_Walter_Haeb-Umbach_2018, title={Machine learning techniques for semantic analysis of dysarthric speech: An experimental study}, journal={Speech Communication 99 (2018) 242-251 (Elsevier B.V.)}, author={Despotovic, Vladimir and Walter, Oliver and Haeb-Umbach, Reinhold}, year={2018} }","mla":"Despotovic, Vladimir, et al. “Machine Learning Techniques for Semantic Analysis of Dysarthric Speech: An Experimental Study.” <i>Speech Communication 99 (2018) 242-251 (Elsevier B.V.)</i>, 2018.","short":"V. Despotovic, O. Walter, R. Haeb-Umbach, Speech Communication 99 (2018) 242-251 (Elsevier B.V.) (2018).","apa":"Despotovic, V., Walter, O., &#38; Haeb-Umbach, R. (2018). Machine learning techniques for semantic analysis of dysarthric speech: An experimental study. <i>Speech Communication 99 (2018) 242-251 (Elsevier B.V.)</i>.","ama":"Despotovic V, Walter O, Haeb-Umbach R. 
Machine learning techniques for semantic analysis of dysarthric speech: An experimental study. <i>Speech Communication 99 (2018) 242-251 (Elsevier BV)</i>. 2018.","ieee":"V. Despotovic, O. Walter, and R. Haeb-Umbach, “Machine learning techniques for semantic analysis of dysarthric speech: An experimental study,” <i>Speech Communication 99 (2018) 242-251 (Elsevier B.V.)</i>, 2018.","chicago":"Despotovic, Vladimir, Oliver Walter, and Reinhold Haeb-Umbach. “Machine Learning Techniques for Semantic Analysis of Dysarthric Speech: An Experimental Study.” <i>Speech Communication 99 (2018) 242-251 (Elsevier B.V.)</i>, 2018."},"year":"2018","date_created":"2019-07-12T05:30:44Z","author":[{"full_name":"Despotovic, Vladimir","last_name":"Despotovic","first_name":"Vladimir"},{"first_name":"Oliver","full_name":"Walter, Oliver","last_name":"Walter"},{"id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach","first_name":"Reinhold"}],"date_updated":"2022-01-06T06:51:12Z","oa":"1","main_file_link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2018/SpeechCommunication_2018_Walter_Paper.pdf","open_access":"1"}],"title":"Machine learning techniques for semantic analysis of dysarthric speech: An experimental study","type":"journal_article","publication":"Speech Communication 99 (2018) 242-251 (Elsevier B.V.)","status":"public","abstract":[{"text":"We present an experimental comparison of seven state-of-the-art machine learning algorithms for the task of semantic analysis of spoken input, with a special emphasis on applications for dysarthric speech. Dysarthria is a motor speech disorder, which is characterized by poor articulation of phonemes. In order to cater for these noncanonical phoneme realizations, we employed an unsupervised learning approach to estimate the acoustic models for speech recognition, which does not require a literal transcription of the training data. 
Even for the subsequent task of semantic analysis, only weak supervision is employed, whereby the training utterance is accompanied by a semantic label only, rather than a literal transcription. Results on two databases, one of them containing dysarthric speech, are presented showing that Markov logic networks and conditional random fields substantially outperform other machine learning approaches. Markov logic networks have proved to be especially robust to recognition errors, which are caused by imprecise articulation in dysarthric speech.","lang":"eng"}],"user_id":"44006","department":[{"_id":"54"}],"_id":"11916","language":[{"iso":"eng"}]},{"language":[{"iso":"eng"}],"user_id":"44006","department":[{"_id":"54"}],"_id":"12898","status":"public","abstract":[{"text":"Deep clustering (DC) and deep attractor networks (DANs) are a data-driven way to monaural blind source separation. Both approaches provide astonishing single channel performance but have not yet been generalized to block-online processing. When separating speech in a continuous stream with a block-online algorithm, it needs to be determined in each block which of the output streams belongs to whom. In this contribution we solve this block permutation problem by introducing an additional speaker identification embedding to the DAN model structure. We motivate this model decision by analyzing the embedding topology of DC and DANs and show, that DC and DANs themselves are not sufficient for speaker identification. 
This model structure (a) improves the signal to distortion ratio (SDR) over a DAN baseline and (b) provides up to 61% and up to 34% relative reduction in permutation error rate and re-identification error rate compared to an i-vector baseline, respectively.","lang":"eng"}],"type":"conference","publication":"ICASSP 2018, Calgary, Canada","main_file_link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2018/ICASSP_2018_Drude2_Paper.pdf","open_access":"1"}],"title":"Deep Attractor Networks for Speaker Re-Identifikation and Blind Source Separation","date_created":"2019-07-30T14:22:53Z","author":[{"last_name":"Drude","id":"11213","full_name":"Drude, Lukas","first_name":"Lukas"},{"first_name":"Thilo","full_name":"von Neumann, Thilo","last_name":"von Neumann"},{"last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold"}],"date_updated":"2022-01-06T06:51:24Z","oa":"1","citation":{"bibtex":"@inproceedings{Drude_von Neumann_Haeb-Umbach_2018, title={Deep Attractor Networks for Speaker Re-Identifikation and Blind Source Separation}, booktitle={ICASSP 2018, Calgary, Canada}, author={Drude, Lukas and von Neumann, Thilo and Haeb-Umbach, Reinhold}, year={2018} }","mla":"Drude, Lukas, et al. “Deep Attractor Networks for Speaker Re-Identifikation and Blind Source Separation.” <i>ICASSP 2018, Calgary, Canada</i>, 2018.","short":"L. Drude, T. von Neumann, R. Haeb-Umbach, in: ICASSP 2018, Calgary, Canada, 2018.","apa":"Drude, L., von Neumann, T., &#38; Haeb-Umbach, R. (2018). Deep Attractor Networks for Speaker Re-Identifikation and Blind Source Separation. In <i>ICASSP 2018, Calgary, Canada</i>.","chicago":"Drude, Lukas, Thilo von Neumann, and Reinhold Haeb-Umbach. “Deep Attractor Networks for Speaker Re-Identifikation and Blind Source Separation.” In <i>ICASSP 2018, Calgary, Canada</i>, 2018.","ieee":"L. Drude, T. von Neumann, and R. 
Haeb-Umbach, “Deep Attractor Networks for Speaker Re-Identifikation and Blind Source Separation,” in <i>ICASSP 2018, Calgary, Canada</i>, 2018.","ama":"Drude L, von Neumann T, Haeb-Umbach R. Deep Attractor Networks for Speaker Re-Identifikation and Blind Source Separation. In: <i>ICASSP 2018, Calgary, Canada</i>. ; 2018."},"year":"2018","related_material":{"link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2018/ICASSP_2018_Drude2_Slides.pdf","description":"Slides","relation":"supplementary_material"}]}},{"language":[{"iso":"eng"}],"department":[{"_id":"54"}],"user_id":"44006","_id":"12900","status":"public","abstract":[{"text":"Deep attractor networks (DANs) are a recently introduced method to blindly separate sources from spectral features of a monaural recording using bidirectional long short-term memory networks (BLSTMs). Due to the nature of BLSTMs, this is inherently not online-ready and resorting to operating on blocks yields a block permutation problem in that the index of each speaker may change between blocks. We here propose the joint modeling of spatial and spectral features to solve the block permutation problem and generalize DANs to multi-channel meeting recordings: The DAN acts as a spectral feature extractor for a subsequent model-based clustering approach. We first analyze different joint models in batch-processing scenarios and finally propose a block-online blind source separation algorithm. The efficacy of the proposed models is demonstrated on reverberant mixtures corrupted by real recordings of multi-channel background noise. 
We demonstrate that both the proposed batch-processing and the proposed block-online system outperform (a) a spatial-only model with a state-of-the-art frequency permutation solver and (b) a spectral-only model with an oracle block permutation solver in terms of signal to distortion ratio (SDR) gains.","lang":"eng"}],"publication":"ICASSP 2018, Calgary, Canada","type":"conference","main_file_link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2018/ICASSP_2018_Drude_Paper.pdf","open_access":"1"}],"title":"Dual Frequency- and Block-Permutation Alignment for Deep Learning Based Block-Online Blind Source Separation","author":[{"first_name":"Lukas","last_name":"Drude","full_name":"Drude, Lukas","id":"11213"},{"first_name":"Takuya","full_name":"Higuchi, Takuya","last_name":"Higuchi"},{"first_name":"Keisuke","full_name":"Kinoshita, Keisuke","last_name":"Kinoshita"},{"first_name":"Tomohiro","full_name":"Nakatani, Tomohiro","last_name":"Nakatani"},{"last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold"}],"date_created":"2019-07-30T14:42:15Z","date_updated":"2022-01-06T06:51:24Z","oa":"1","citation":{"mla":"Drude, Lukas, et al. “Dual Frequency- and Block-Permutation Alignment for Deep Learning Based Block-Online Blind Source Separation.” <i>ICASSP 2018, Calgary, Canada</i>, 2018.","short":"L. Drude,  Takuya  Higuchi, K. Kinoshita, T. Nakatani, R. Haeb-Umbach, in: ICASSP 2018, Calgary, Canada, 2018.","bibtex":"@inproceedings{Drude_Higuchi,_Kinoshita_Nakatani_Haeb-Umbach_2018, title={Dual Frequency- and Block-Permutation Alignment for Deep Learning Based Block-Online Blind Source Separation}, booktitle={ICASSP 2018, Calgary, Canada}, author={Drude, Lukas and Higuchi,  Takuya  and Kinoshita, Keisuke  and Nakatani, Tomohiro  and Haeb-Umbach, Reinhold}, year={2018} }","apa":"Drude, L., Higuchi,  Takuya , Kinoshita, K., Nakatani, T., &#38; Haeb-Umbach, R. (2018). 
Dual Frequency- and Block-Permutation Alignment for Deep Learning Based Block-Online Blind Source Separation. In <i>ICASSP 2018, Calgary, Canada</i>.","ieee":"L. Drude,  Takuya  Higuchi, K. Kinoshita, T. Nakatani, and R. Haeb-Umbach, “Dual Frequency- and Block-Permutation Alignment for Deep Learning Based Block-Online Blind Source Separation,” in <i>ICASSP 2018, Calgary, Canada</i>, 2018.","chicago":"Drude, Lukas,  Takuya  Higuchi, Keisuke  Kinoshita, Tomohiro  Nakatani, and Reinhold Haeb-Umbach. “Dual Frequency- and Block-Permutation Alignment for Deep Learning Based Block-Online Blind Source Separation.” In <i>ICASSP 2018, Calgary, Canada</i>, 2018.","ama":"Drude L, Higuchi,  Takuya , Kinoshita K, Nakatani T, Haeb-Umbach R. Dual Frequency- and Block-Permutation Alignment for Deep Learning Based Block-Online Blind Source Separation. In: <i>ICASSP 2018, Calgary, Canada</i>. ; 2018."},"year":"2018","related_material":{"link":[{"description":"Poster","relation":"supplementary_material","url":"https://groups.uni-paderborn.de/nt/pubs/2018/ICASSP_2018_Drude_Poster.pdf"}]}},{"title":"Exploring Practical Aspects of Neural Mask-Based Beamforming for Far-Field Speech Recognition","main_file_link":[{"open_access":"1","url":"https://groups.uni-paderborn.de/nt/pubs/2018/ICASSP_2018_Boeddeker_Paper.pdf"}],"oa":"1","date_updated":"2022-01-06T06:51:24Z","date_created":"2019-07-30T14:53:58Z","author":[{"first_name":"Christoph","full_name":"Boeddeker, Christoph","id":"40767","last_name":"Boeddeker"},{"first_name":"Hakan","full_name":"Erdogan, Hakan","last_name":"Erdogan"},{"first_name":"Takuya","full_name":"Yoshioka, Takuya","last_name":"Yoshioka"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242"}],"year":"2018","citation":{"bibtex":"@inproceedings{Boeddeker_Erdogan_Yoshioka_Haeb-Umbach_2018, title={Exploring Practical Aspects of Neural Mask-Based Beamforming for Far-Field Speech Recognition}, booktitle={ICASSP 2018, Calgary, 
Canada}, author={Boeddeker, Christoph and Erdogan, Hakan and Yoshioka, Takuya and Haeb-Umbach, Reinhold}, year={2018} }","short":"C. Boeddeker, H. Erdogan, T. Yoshioka, R. Haeb-Umbach, in: ICASSP 2018, Calgary, Canada, 2018.","mla":"Boeddeker, Christoph, et al. “Exploring Practical Aspects of Neural Mask-Based Beamforming for Far-Field Speech Recognition.” <i>ICASSP 2018, Calgary, Canada</i>, 2018.","apa":"Boeddeker, C., Erdogan, H., Yoshioka, T., &#38; Haeb-Umbach, R. (2018). Exploring Practical Aspects of Neural Mask-Based Beamforming for Far-Field Speech Recognition. In <i>ICASSP 2018, Calgary, Canada</i>.","ama":"Boeddeker C, Erdogan H, Yoshioka T, Haeb-Umbach R. Exploring Practical Aspects of Neural Mask-Based Beamforming for Far-Field Speech Recognition. In: <i>ICASSP 2018, Calgary, Canada</i>. ; 2018.","ieee":"C. Boeddeker, H. Erdogan, T. Yoshioka, and R. Haeb-Umbach, “Exploring Practical Aspects of Neural Mask-Based Beamforming for Far-Field Speech Recognition,” in <i>ICASSP 2018, Calgary, Canada</i>, 2018.","chicago":"Boeddeker, Christoph, Hakan Erdogan, Takuya Yoshioka, and Reinhold Haeb-Umbach. “Exploring Practical Aspects of Neural Mask-Based Beamforming for Far-Field Speech Recognition.” In <i>ICASSP 2018, Calgary, Canada</i>, 2018."},"related_material":{"link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2018/ICASSP_2018_Boeddeker_Slides.pdf","relation":"supplementary_material","description":"Poster"}]},"language":[{"iso":"eng"}],"_id":"12901","department":[{"_id":"54"}],"user_id":"44006","abstract":[{"text":"This work examines acoustic beamformers employing neural networks (NNs) for mask prediction as front-end for automatic speech recognition (ASR) systems for practical scenarios like voice-enabled home devices. To test the versatility of the mask predicting network, the system is evaluated with different recording hardware, different microphone array designs, and different acoustic models of the downstream ASR system. 
Significant gains in recognition accuracy are obtained in all configurations despite the fact that the NN had been trained on mismatched data. Unlike previous work, the NN is trained on a feature level objective, which gives some performance advantage over a mask related criterion. Furthermore, different approaches for realizing online, or adaptive, NN-based beamforming are explored, where the online algorithms still show significant gains compared to the baseline performance.","lang":"eng"}],"status":"public","publication":"ICASSP 2018, Calgary, Canada","type":"conference"},{"department":[{"_id":"54"}],"user_id":"460","_id":"12899","project":[{"name":"Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"language":[{"iso":"eng"}],"publication":"Proc. CHiME 2018 Workshop on Speech Processing in Everyday Environments, Hyderabad, India","type":"conference","status":"public","abstract":[{"text":"This contribution presents a speech enhancement system for the CHiME-5 Dinner Party Scenario. The front-end employs multi-channel linear time-variant filtering and achieves its gains without the use of a neural network. We present an adaptation of blind source separation techniques to the CHiME-5 database which we call Guided Source Separation (GSS). 
Using the baseline acoustic and language model, the combination of Weighted Prediction Error based dereverberation, guided source separation, and beamforming reduces the WER by 10.54% (relative) for the single array track and by 21.12% (relative) on the multiple array track.","lang":"eng"}],"author":[{"full_name":"Boeddeker, Christoph","id":"40767","last_name":"Boeddeker","first_name":"Christoph"},{"id":"27643","full_name":"Heitkaemper, Jens","last_name":"Heitkaemper","first_name":"Jens"},{"first_name":"Joerg","last_name":"Schmalenstroeer","full_name":"Schmalenstroeer, Joerg","id":"460"},{"last_name":"Drude","full_name":"Drude, Lukas","id":"11213","first_name":"Lukas"},{"last_name":"Heymann","full_name":"Heymann, Jahn","first_name":"Jahn"},{"first_name":"Reinhold","id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach"}],"date_created":"2019-07-30T14:35:15Z","oa":"1","date_updated":"2023-10-26T08:14:15Z","main_file_link":[{"open_access":"1","url":"https://groups.uni-paderborn.de/nt/pubs/2018/INTERSPEECH_2018_Heitkaemper_Paper.pdf"}],"title":"Front-End Processing for the CHiME-5 Dinner Party Scenario","related_material":{"link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2018/INTERSPEECH_2018_Heitkaemper_Poster.pdf","relation":"supplementary_material","description":"Poster"}]},"quality_controlled":"1","citation":{"bibtex":"@inproceedings{Boeddeker_Heitkaemper_Schmalenstroeer_Drude_Heymann_Haeb-Umbach_2018, title={Front-End Processing for the CHiME-5 Dinner Party Scenario}, booktitle={Proc. CHiME 2018 Workshop on Speech Processing in Everyday Environments, Hyderabad, India}, author={Boeddeker, Christoph and Heitkaemper, Jens and Schmalenstroeer, Joerg and Drude, Lukas and Heymann, Jahn and Haeb-Umbach, Reinhold}, year={2018} }","short":"C. Boeddeker, J. Heitkaemper, J. Schmalenstroeer, L. Drude, J. Heymann, R. Haeb-Umbach, in: Proc. 
CHiME 2018 Workshop on Speech Processing in Everyday Environments, Hyderabad, India, 2018.","mla":"Boeddeker, Christoph, et al. “Front-End Processing for the CHiME-5 Dinner Party Scenario.” <i>Proc. CHiME 2018 Workshop on Speech Processing in Everyday Environments, Hyderabad, India</i>, 2018.","apa":"Boeddeker, C., Heitkaemper, J., Schmalenstroeer, J., Drude, L., Heymann, J., &#38; Haeb-Umbach, R. (2018). Front-End Processing for the CHiME-5 Dinner Party Scenario. <i>Proc. CHiME 2018 Workshop on Speech Processing in Everyday Environments, Hyderabad, India</i>.","ieee":"C. Boeddeker, J. Heitkaemper, J. Schmalenstroeer, L. Drude, J. Heymann, and R. Haeb-Umbach, “Front-End Processing for the CHiME-5 Dinner Party Scenario,” 2018.","chicago":"Boeddeker, Christoph, Jens Heitkaemper, Joerg Schmalenstroeer, Lukas Drude, Jahn Heymann, and Reinhold Haeb-Umbach. “Front-End Processing for the CHiME-5 Dinner Party Scenario.” In <i>Proc. CHiME 2018 Workshop on Speech Processing in Everyday Environments, Hyderabad, India</i>, 2018.","ama":"Boeddeker C, Heitkaemper J, Schmalenstroeer J, Drude L, Heymann J, Haeb-Umbach R. Front-End Processing for the CHiME-5 Dinner Party Scenario. In: <i>Proc. CHiME 2018 Workshop on Speech Processing in Everyday Environments, Hyderabad, India</i>. ; 2018."},"year":"2018"},{"_id":"6859","project":[{"name":"Akustische Sensornetzwerke - Teilprojekt ","_id":"27"},{"name":"Akustische Sensornetzwerke - Teilprojekt \"Verteilte akustische Signalverarbeitung über funkbasierte Sensornetzwerke\"","_id":"27"}],"department":[{"_id":"75"},{"_id":"54"}],"user_id":"460","language":[{"iso":"eng"}],"publication":"Speech Communication; 13th ITG-Symposium","type":"conference","abstract":[{"text":"Signal processing in WASNs is based on a software framework for hosting the algorithms as well as on a set of wireless connected devices representing the hardware. 
Each of the nodes contributes memory, processing power, communication bandwidth and some sensor information for the tasks to be solved on the network. \r\nIn this paper we present our MARVELO framework for distributed signal processing. It is intended for transforming existing centralized implementations into distributed versions. To this end, the software only needs a block-oriented implementation, which MARVELO picks-up and distributes on the network. Additionally, our sensor node hardware and the audio interfaces responsible for multi-channel recordings are presented.","lang":"eng"}],"status":"public","date_updated":"2023-10-26T08:15:32Z","date_created":"2019-01-17T15:47:35Z","author":[{"first_name":"Haitham","id":"65718","full_name":"Afifi, Haitham","last_name":"Afifi"},{"first_name":"Joerg","last_name":"Schmalenstroeer","full_name":"Schmalenstroeer, Joerg","id":"460"},{"first_name":"Joerg","id":"16256","full_name":"Ullmann, Joerg","last_name":"Ullmann"},{"full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach","first_name":"Reinhold"},{"first_name":"Holger","last_name":"Karl","id":"126","full_name":"Karl, Holger"}],"title":"MARVELO - A Framework for Signal Processing in Wireless Acoustic Sensor Networks","quality_controlled":"1","year":"2018","page":"1-5","citation":{"apa":"Afifi, H., Schmalenstroeer, J., Ullmann, J., Haeb-Umbach, R., &#38; Karl, H. (2018). MARVELO - A Framework for Signal Processing in Wireless Acoustic Sensor Networks. <i>Speech Communication; 13th ITG-Symposium</i>, 1–5.","bibtex":"@inproceedings{Afifi_Schmalenstroeer_Ullmann_Haeb-Umbach_Karl_2018, title={MARVELO - A Framework for Signal Processing in Wireless Acoustic Sensor Networks}, booktitle={Speech Communication; 13th ITG-Symposium}, author={Afifi, Haitham and Schmalenstroeer, Joerg and Ullmann, Joerg and Haeb-Umbach, Reinhold and Karl, Holger}, year={2018}, pages={1–5} }","mla":"Afifi, Haitham, et al. 
“MARVELO - A Framework for Signal Processing in Wireless Acoustic Sensor Networks.” <i>Speech Communication; 13th ITG-Symposium</i>, 2018, pp. 1–5.","short":"H. Afifi, J. Schmalenstroeer, J. Ullmann, R. Haeb-Umbach, H. Karl, in: Speech Communication; 13th ITG-Symposium, 2018, pp. 1–5.","ieee":"H. Afifi, J. Schmalenstroeer, J. Ullmann, R. Haeb-Umbach, and H. Karl, “MARVELO - A Framework for Signal Processing in Wireless Acoustic Sensor Networks,” in <i>Speech Communication; 13th ITG-Symposium</i>, 2018, pp. 1–5.","chicago":"Afifi, Haitham, Joerg Schmalenstroeer, Joerg Ullmann, Reinhold Haeb-Umbach, and Holger Karl. “MARVELO - A Framework for Signal Processing in Wireless Acoustic Sensor Networks.” In <i>Speech Communication; 13th ITG-Symposium</i>, 1–5, 2018.","ama":"Afifi H, Schmalenstroeer J, Ullmann J, Haeb-Umbach R, Karl H. MARVELO - A Framework for Signal Processing in Wireless Acoustic Sensor Networks. In: <i>Speech Communication; 13th ITG-Symposium</i>. ; 2018:1-5."}},{"citation":{"apa":"Grimm, C., Breddermann, T., Farhoud, R., Fei, T., Warsitz, E., &#38; Haeb-Umbach, R. (2018). Discrimination of Stationary from Moving Targets with Recurrent Neural Networks in Automotive Radar. <i>International Conference on Microwaves for Intelligent Mobility (ICMIM) 2018</i>.","mla":"Grimm, Christopher, et al. “Discrimination of Stationary from Moving Targets with Recurrent Neural Networks in Automotive Radar.” <i>International Conference on Microwaves for Intelligent Mobility (ICMIM) 2018</i>, 2018.","short":"C. Grimm, T. Breddermann, R. Farhoud, T. Fei, E. Warsitz, R. 
Haeb-Umbach, in: International Conference on Microwaves for Intelligent Mobility (ICMIM) 2018, 2018.","bibtex":"@inproceedings{Grimm_Breddermann_Farhoud_Fei_Warsitz_Haeb-Umbach_2018, title={Discrimination of Stationary from Moving Targets with Recurrent Neural Networks in Automotive Radar}, booktitle={International Conference on Microwaves for Intelligent Mobility (ICMIM) 2018}, author={Grimm, Christopher and Breddermann, Tobias and Farhoud, Ridha and Fei, Tai and Warsitz, Ernst and Haeb-Umbach, Reinhold}, year={2018} }","chicago":"Grimm, Christopher, Tobias Breddermann, Ridha Farhoud, Tai Fei, Ernst Warsitz, and Reinhold Haeb-Umbach. “Discrimination of Stationary from Moving Targets with Recurrent Neural Networks in Automotive Radar.” In <i>International Conference on Microwaves for Intelligent Mobility (ICMIM) 2018</i>, 2018.","ieee":"C. Grimm, T. Breddermann, R. Farhoud, T. Fei, E. Warsitz, and R. Haeb-Umbach, “Discrimination of Stationary from Moving Targets with Recurrent Neural Networks in Automotive Radar,” 2018.","ama":"Grimm C, Breddermann T, Farhoud R, Fei T, Warsitz E, Haeb-Umbach R. Discrimination of Stationary from Moving Targets with Recurrent Neural Networks in Automotive Radar. In: <i>International Conference on Microwaves for Intelligent Mobility (ICMIM) 2018</i>. 
; 2018."},"year":"2018","quality_controlled":"1","main_file_link":[{"open_access":"1","url":"https://groups.uni-paderborn.de/nt/pubs/2018/ICMIM_2018_Haeb-Umbach_Paper.pdf"}],"title":"Discrimination of Stationary from Moving Targets with Recurrent Neural Networks in Automotive Radar","author":[{"first_name":"Christopher","full_name":"Grimm, Christopher","last_name":"Grimm"},{"first_name":"Tobias","full_name":"Breddermann, Tobias","last_name":"Breddermann"},{"last_name":"Farhoud","full_name":"Farhoud, Ridha","first_name":"Ridha"},{"last_name":"Fei","full_name":"Fei, Tai","first_name":"Tai"},{"first_name":"Ernst","full_name":"Warsitz, Ernst","last_name":"Warsitz"},{"last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242","first_name":"Reinhold"}],"date_created":"2019-07-12T05:27:29Z","date_updated":"2023-11-20T16:37:39Z","oa":"1","status":"public","abstract":[{"lang":"eng","text":"In this paper, we present a neural network based classification algorithm for the discrimination of moving from stationary targets in the sight of an automotive radar sensor. 
Compared to existing algorithms, the proposed algorithm can take into account multiple local radar targets instead of performing classification inference on each target individually resulting in superior discrimination accuracy, especially suitable for non rigid objects, like pedestrians, which in general have a wide velocity spread when multiple targets are detected."}],"type":"conference","publication":"International Conference on Microwaves for Intelligent Mobility (ICMIM) 2018","language":[{"iso":"eng"}],"user_id":"242","department":[{"_id":"54"}],"_id":"11747"},{"main_file_link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2018/INTERSPEECH_2018_Glarner_Paper.pdf","open_access":"1"}],"title":"Full Bayesian Hidden Markov Model Variational Autoencoder for Acoustic Unit Discovery","date_created":"2019-07-12T05:30:34Z","author":[{"first_name":"Thomas","id":"14169","full_name":"Glarner, Thomas","last_name":"Glarner"},{"full_name":"Hanebrink, Patrick","last_name":"Hanebrink","first_name":"Patrick"},{"first_name":"Janek","last_name":"Ebbers","full_name":"Ebbers, Janek","id":"34851"},{"last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242","first_name":"Reinhold"}],"date_updated":"2023-11-22T08:29:22Z","oa":"1","citation":{"ama":"Glarner T, Hanebrink P, Ebbers J, Haeb-Umbach R. Full Bayesian Hidden Markov Model Variational Autoencoder for Acoustic Unit Discovery. In: <i>INTERSPEECH 2018, Hyderabad, India</i>. ; 2018.","ieee":"T. Glarner, P. Hanebrink, J. Ebbers, and R. Haeb-Umbach, “Full Bayesian Hidden Markov Model Variational Autoencoder for Acoustic Unit Discovery,” 2018.","chicago":"Glarner, Thomas, Patrick Hanebrink, Janek Ebbers, and Reinhold Haeb-Umbach. “Full Bayesian Hidden Markov Model Variational Autoencoder for Acoustic Unit Discovery.” In <i>INTERSPEECH 2018, Hyderabad, India</i>, 2018.","apa":"Glarner, T., Hanebrink, P., Ebbers, J., &#38; Haeb-Umbach, R. (2018). 
Full Bayesian Hidden Markov Model Variational Autoencoder for Acoustic Unit Discovery. <i>INTERSPEECH 2018, Hyderabad, India</i>.","mla":"Glarner, Thomas, et al. “Full Bayesian Hidden Markov Model Variational Autoencoder for Acoustic Unit Discovery.” <i>INTERSPEECH 2018, Hyderabad, India</i>, 2018.","short":"T. Glarner, P. Hanebrink, J. Ebbers, R. Haeb-Umbach, in: INTERSPEECH 2018, Hyderabad, India, 2018.","bibtex":"@inproceedings{Glarner_Hanebrink_Ebbers_Haeb-Umbach_2018, title={Full Bayesian Hidden Markov Model Variational Autoencoder for Acoustic Unit Discovery}, booktitle={INTERSPEECH 2018, Hyderabad, India}, author={Glarner, Thomas and Hanebrink, Patrick and Ebbers, Janek and Haeb-Umbach, Reinhold}, year={2018} }"},"year":"2018","related_material":{"link":[{"relation":"supplementary_material","description":"Slides","url":"https://groups.uni-paderborn.de/nt/pubs/2018/INTERSPEECH_2018_Glarner_Slides.pdf"}]},"quality_controlled":"1","language":[{"iso":"eng"}],"user_id":"34851","department":[{"_id":"54"}],"_id":"11907","status":"public","abstract":[{"text":"The invention of the Variational Autoencoder enables the application of Neural Networks to a wide range of tasks in unsupervised learning, including the field of Acoustic Unit Discovery (AUD). The recently proposed Hidden Markov Model Variational Autoencoder (HMMVAE) allows a joint training of a neural network based feature extractor and a structured prior for the latent space given by a Hidden Markov Model. It has been shown that the HMMVAE significantly outperforms pure GMM-HMM based systems on the AUD task. However, the HMMVAE cannot autonomously infer the number of acoustic units and thus relies on the GMM-HMM system for initialization. 
This paper introduces the Bayesian Hidden Markov Model Variational Autoencoder (BHMMVAE) which solves these issues by embedding the HMMVAE in a Bayesian framework with a Dirichlet Process Prior for the distribution of the acoustic units, and diagonal or full-covariance Gaussians as emission distributions. Experiments on TIMIT and Xitsonga show that the BHMMVAE is able to autonomously infer a reasonable number of acoustic units, can be initialized without supervision by a GMM-HMM system, achieves computationally efficient stochastic variational inference by using natural gradient descent, and, additionally, improves the AUD performance over the HMMVAE.","lang":"eng"}],"type":"conference","publication":"INTERSPEECH 2018, Hyderabad, India"}]
