[{"project":[{"name":"TRR 318 - C06: TRR 318 - Technisch unterstütztes Erklären von Stimmcharakteristika (Teilprojekt C06)","_id":"129","grant_number":"438445824"}],"_id":"44849","user_id":"72602","department":[{"_id":"54"},{"_id":"660"}],"ddc":["000"],"language":[{"iso":"eng"}],"file_date_updated":"2024-02-29T16:15:12Z","type":"conference","publication":"Fortschritte der Akustik - DAGA 2023","file":[{"content_type":"application/pdf","relation":"main_file","date_updated":"2024-02-29T16:15:12Z","creator":"frra","date_created":"2024-02-29T16:15:12Z","file_size":289493,"file_id":"52221","access_level":"open_access","file_name":"Daga_2023_Rautenberg_Paper.pdf"}],"status":"public","date_updated":"2024-02-29T17:05:16Z","oa":"1","date_created":"2023-05-15T08:48:54Z","author":[{"id":"72602","full_name":"Rautenberg, Frederik","last_name":"Rautenberg","first_name":"Frederik"},{"first_name":"Michael","last_name":"Kuhlmann","id":"49871","full_name":"Kuhlmann, Michael"},{"first_name":"Janek","last_name":"Ebbers","full_name":"Ebbers, Janek","id":"34851"},{"first_name":"Jana","full_name":"Wiechmann, Jana","last_name":"Wiechmann"},{"first_name":"Fritz","full_name":"Seebauer, Fritz","last_name":"Seebauer"},{"last_name":"Wagner","full_name":"Wagner, Petra","first_name":"Petra"},{"full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach","first_name":"Reinhold"}],"title":"Speech Disentanglement for Analysis and Modification of Acoustic and Perceptual Speaker Characteristics","main_file_link":[{"open_access":"1","url":"https://pub.dega-akustik.de/DAGA_2023/data/articles/000105.pdf"}],"conference":{"start_date":"2023-03-06","name":"DAGA 2023 - 49. 
Jahrestagung für Akustik","location":"Hamburg","end_date":"2023-03-09"},"publication_status":"published","has_accepted_license":"1","year":"2023","citation":{"bibtex":"@inproceedings{Rautenberg_Kuhlmann_Ebbers_Wiechmann_Seebauer_Wagner_Haeb-Umbach_2023, title={Speech Disentanglement for Analysis and Modification of Acoustic and Perceptual Speaker Characteristics}, booktitle={Fortschritte der Akustik - DAGA 2023}, author={Rautenberg, Frederik and Kuhlmann, Michael and Ebbers, Janek and Wiechmann, Jana and Seebauer, Fritz and Wagner, Petra and Haeb-Umbach, Reinhold}, year={2023}, pages={1409–1412} }","mla":"Rautenberg, Frederik, et al. “Speech Disentanglement for Analysis and Modification of Acoustic and Perceptual Speaker Characteristics.” <i>Fortschritte Der Akustik - DAGA 2023</i>, 2023, pp. 1409–12.","short":"F. Rautenberg, M. Kuhlmann, J. Ebbers, J. Wiechmann, F. Seebauer, P. Wagner, R. Haeb-Umbach, in: Fortschritte Der Akustik - DAGA 2023, 2023, pp. 1409–1412.","apa":"Rautenberg, F., Kuhlmann, M., Ebbers, J., Wiechmann, J., Seebauer, F., Wagner, P., &#38; Haeb-Umbach, R. (2023). Speech Disentanglement for Analysis and Modification of Acoustic and Perceptual Speaker Characteristics. <i>Fortschritte Der Akustik - DAGA 2023</i>, 1409–1412.","ieee":"F. Rautenberg <i>et al.</i>, “Speech Disentanglement for Analysis and Modification of Acoustic and Perceptual Speaker Characteristics,” in <i>Fortschritte der Akustik - DAGA 2023</i>, Hamburg, 2023, pp. 1409–1412.","chicago":"Rautenberg, Frederik, Michael Kuhlmann, Janek Ebbers, Jana Wiechmann, Fritz Seebauer, Petra Wagner, and Reinhold Haeb-Umbach. “Speech Disentanglement for Analysis and Modification of Acoustic and Perceptual Speaker Characteristics.” In <i>Fortschritte Der Akustik - DAGA 2023</i>, 1409–12, 2023.","ama":"Rautenberg F, Kuhlmann M, Ebbers J, et al. Speech Disentanglement for Analysis and Modification of Acoustic and Perceptual Speaker Characteristics. In: <i>Fortschritte Der Akustik - DAGA 2023</i>. 
; 2023:1409-1412."},"page":"1409-1412"},{"publication":"Proceedings of the 8th Detection and Classification of Acoustic Scenes and Events 2023 Workshop (DCASE2023)","type":"conference","abstract":[{"lang":"eng","text":"Due to the high variation in the application requirements of sound event detection (SED) systems, it is not sufficient to evaluate systems only in a single operating mode. Therefore, the community recently adopted the polyphonic sound detection score (PSDS) as an evaluation metric, which is the normalized area under the PSD receiver operating characteristic (PSD-ROC). It summarizes the system performance over a range of operating modes resulting from varying the decision threshold that is used to translate the system output scores into a binary detection output. Hence, it provides a more complete picture of the overall system behavior and is less biased by specific threshold tuning. However, besides the decision threshold there is also the post-processing that can be changed to enter another operating mode. In this paper we propose the post-processing independent PSDS (piPSDS) as a generalization of the PSDS. Here, the post-processing independent PSD-ROC includes operating points from varying post-processings with varying decision thresholds. Thus, it summarizes even more operating modes of an SED system and allows for system comparison without the need of implementing a post-processing and without a bias due to different post-processings. While piPSDS can in principle combine different types of post-processing, we here, as a first step, present median filter independent PSDS (miPSDS) results for this year’s DCASE Challenge Task4a systems. 
Source code is publicly available in our sed_scores_eval package (https://github.com/fgnt/sed_scores_eval)."}],"status":"public","file":[{"file_size":221875,"access_level":"closed","file_id":"49112","file_name":"dcase2023_ebbers.pdf","date_updated":"2023-11-22T08:25:08Z","creator":"ebbers","date_created":"2023-11-22T08:25:08Z","success":1,"relation":"main_file","content_type":"application/pdf"}],"_id":"49111","project":[{"name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"department":[{"_id":"54"}],"user_id":"34851","ddc":["000"],"language":[{"iso":"eng"}],"file_date_updated":"2023-11-22T08:25:08Z","quality_controlled":"1","has_accepted_license":"1","year":"2023","place":"Tampere, Finland","page":"36–40","citation":{"chicago":"Ebbers, Janek, Reinhold Haeb-Umbach, and Romain Serizel. “Post-Processing Independent Evaluation of Sound Event Detection Systems.” In <i>Proceedings of the 8th Detection and Classification of Acoustic Scenes and Events 2023 Workshop (DCASE2023)</i>, 36–40. Tampere, Finland, 2023.","ieee":"J. Ebbers, R. Haeb-Umbach, and R. Serizel, “Post-Processing Independent Evaluation of Sound Event Detection Systems,” in <i>Proceedings of the 8th Detection and Classification of Acoustic Scenes and Events 2023 Workshop (DCASE2023)</i>, 2023, pp. 36–40.","ama":"Ebbers J, Haeb-Umbach R, Serizel R. Post-Processing Independent Evaluation of Sound Event Detection Systems. In: <i>Proceedings of the 8th Detection and Classification of Acoustic Scenes and Events 2023 Workshop (DCASE2023)</i>. ; 2023:36–40.","short":"J. Ebbers, R. Haeb-Umbach, R. Serizel, in: Proceedings of the 8th Detection and Classification of Acoustic Scenes and Events 2023 Workshop (DCASE2023), Tampere, Finland, 2023, pp. 
36–40.","bibtex":"@inproceedings{Ebbers_Haeb-Umbach_Serizel_2023, place={Tampere, Finland}, title={Post-Processing Independent Evaluation of Sound Event Detection Systems}, booktitle={Proceedings of the 8th Detection and Classification of Acoustic Scenes and Events 2023 Workshop (DCASE2023)}, author={Ebbers, Janek and Haeb-Umbach, Reinhold and Serizel, Romain}, year={2023}, pages={36–40} }","mla":"Ebbers, Janek, et al. “Post-Processing Independent Evaluation of Sound Event Detection Systems.” <i>Proceedings of the 8th Detection and Classification of Acoustic Scenes and Events 2023 Workshop (DCASE2023)</i>, 2023, pp. 36–40.","apa":"Ebbers, J., Haeb-Umbach, R., &#38; Serizel, R. (2023). Post-Processing Independent Evaluation of Sound Event Detection Systems. <i>Proceedings of the 8th Detection and Classification of Acoustic Scenes and Events 2023 Workshop (DCASE2023)</i>, 36–40."},"date_updated":"2024-11-15T20:34:18Z","author":[{"last_name":"Ebbers","id":"34851","full_name":"Ebbers, Janek","first_name":"Janek"},{"first_name":"Reinhold","full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach"},{"last_name":"Serizel","full_name":"Serizel, Romain","first_name":"Romain"}],"date_created":"2023-11-22T08:20:26Z","title":"Post-Processing Independent Evaluation of Sound Event Detection Systems"},{"oa":"1","date_updated":"2023-10-25T09:04:45Z","author":[{"id":"49871","full_name":"Kuhlmann, Michael","last_name":"Kuhlmann","first_name":"Michael"},{"first_name":"Fritz","full_name":"Seebauer, Fritz","last_name":"Seebauer"},{"id":"34851","full_name":"Ebbers, Janek","last_name":"Ebbers","first_name":"Janek"},{"full_name":"Wagner, Petra","last_name":"Wagner","first_name":"Petra"},{"first_name":"Reinhold","full_name":"Haeb-Umbach, 
Reinhold","id":"242","last_name":"Haeb-Umbach"}],"main_file_link":[{"url":"https://www.isca-speech.org/archive/pdfs/interspeech_2022/kuhlmann22_interspeech.pdf","open_access":"1"}],"doi":"10.21437/interspeech.2022-10740","publication_status":"published","has_accepted_license":"1","citation":{"ama":"Kuhlmann M, Seebauer F, Ebbers J, Wagner P, Haeb-Umbach R. Investigation into Target Speaking Rate Adaptation for Voice Conversion. In: <i>Interspeech 2022</i>. ISCA; 2022. doi:<a href=\"https://doi.org/10.21437/interspeech.2022-10740\">10.21437/interspeech.2022-10740</a>","ieee":"M. Kuhlmann, F. Seebauer, J. Ebbers, P. Wagner, and R. Haeb-Umbach, “Investigation into Target Speaking Rate Adaptation for Voice Conversion,” 2022, doi: <a href=\"https://doi.org/10.21437/interspeech.2022-10740\">10.21437/interspeech.2022-10740</a>.","chicago":"Kuhlmann, Michael, Fritz Seebauer, Janek Ebbers, Petra Wagner, and Reinhold Haeb-Umbach. “Investigation into Target Speaking Rate Adaptation for Voice Conversion.” In <i>Interspeech 2022</i>. ISCA, 2022. <a href=\"https://doi.org/10.21437/interspeech.2022-10740\">https://doi.org/10.21437/interspeech.2022-10740</a>.","apa":"Kuhlmann, M., Seebauer, F., Ebbers, J., Wagner, P., &#38; Haeb-Umbach, R. (2022). Investigation into Target Speaking Rate Adaptation for Voice Conversion. <i>Interspeech 2022</i>. <a href=\"https://doi.org/10.21437/interspeech.2022-10740\">https://doi.org/10.21437/interspeech.2022-10740</a>","short":"M. Kuhlmann, F. Seebauer, J. Ebbers, P. Wagner, R. 
Haeb-Umbach, in: Interspeech 2022, ISCA, 2022.","bibtex":"@inproceedings{Kuhlmann_Seebauer_Ebbers_Wagner_Haeb-Umbach_2022, title={Investigation into Target Speaking Rate Adaptation for Voice Conversion}, DOI={<a href=\"https://doi.org/10.21437/interspeech.2022-10740\">10.21437/interspeech.2022-10740</a>}, booktitle={Interspeech 2022}, publisher={ISCA}, author={Kuhlmann, Michael and Seebauer, Fritz and Ebbers, Janek and Wagner, Petra and Haeb-Umbach, Reinhold}, year={2022} }","mla":"Kuhlmann, Michael, et al. “Investigation into Target Speaking Rate Adaptation for Voice Conversion.” <i>Interspeech 2022</i>, ISCA, 2022, doi:<a href=\"https://doi.org/10.21437/interspeech.2022-10740\">10.21437/interspeech.2022-10740</a>."},"project":[{"_id":"52","name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"_id":"33857","user_id":"34851","department":[{"_id":"54"}],"file_date_updated":"2023-07-15T16:16:12Z","type":"conference","status":"public","publisher":"ISCA","date_created":"2022-10-21T06:50:59Z","title":"Investigation into Target Speaking Rate Adaptation for Voice Conversion","quality_controlled":"1","year":"2022","ddc":["000"],"language":[{"iso":"eng"}],"publication":"Interspeech 2022","file":[{"date_created":"2023-07-15T16:16:12Z","creator":"mikuhl","date_updated":"2023-07-15T16:16:12Z","file_name":"kuhlmann22_interspeech.pdf","access_level":"closed","file_id":"46070","file_size":303863,"content_type":"application/pdf","relation":"main_file","success":1}]},{"publication":"Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","type":"conference","abstract":[{"text":"Performing an adequate evaluation of sound event detection (SED) systems is far from trivial and is still subject to ongoing research. 
The recently proposed polyphonic sound detection (PSD)-receiver operating characteristic (ROC) and PSD score (PSDS) make an important step into the direction of an evaluation of SED systems which is independent from a certain decision threshold. This allows to obtain a more complete picture of the overall system behavior which is less biased by threshold tuning. Yet, the PSD-ROC is currently only approximated using a finite set of thresholds. The choice of\r\nthe thresholds used in approximation, however, can have a severe impact on the resulting PSDS. In this paper we propose a method which allows for computing system performance on an evaluation set for all possible thresholds jointly, enabling accurate computation not only of the PSD-ROC and PSDS but also of other collar-based\r\nand intersection-based performance curves. It further allows to select the threshold which best fulfills the requirements of a given application. Source code is publicly available in our SED evaluation package sed_scores_eval.","lang":"eng"}],"status":"public","file":[{"content_type":"application/pdf","relation":"main_file","date_created":"2022-11-14T12:19:55Z","creator":"ebbers","date_updated":"2022-11-14T12:19:55Z","access_level":"open_access","file_name":"Template.pdf","file_id":"34073","file_size":214001}],"_id":"34072","department":[{"_id":"54"}],"user_id":"34851","ddc":["000"],"language":[{"iso":"eng"}],"file_date_updated":"2022-11-14T12:19:55Z","quality_controlled":"1","has_accepted_license":"1","year":"2022","citation":{"apa":"Ebbers, J., Haeb-Umbach, R., &#38; Serizel, R. (2022). Threshold Independent Evaluation of Sound Event Detection Scores. <i>Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>.","short":"J. Ebbers, R. Haeb-Umbach, R. 
Serizel, in: Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2022.","bibtex":"@inproceedings{Ebbers_Haeb-Umbach_Serizel_2022, title={Threshold Independent Evaluation of Sound Event Detection Scores}, booktitle={Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, author={Ebbers, Janek and Haeb-Umbach, Reinhold and Serizel, Romain}, year={2022} }","mla":"Ebbers, Janek, et al. “Threshold Independent Evaluation of Sound Event Detection Scores.” <i>Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 2022.","ama":"Ebbers J, Haeb-Umbach R, Serizel R. Threshold Independent Evaluation of Sound Event Detection Scores. In: <i>Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>. ; 2022.","chicago":"Ebbers, Janek, Reinhold Haeb-Umbach, and Romain Serizel. “Threshold Independent Evaluation of Sound Event Detection Scores.” In <i>Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 2022.","ieee":"J. Ebbers, R. Haeb-Umbach, and R. Serizel, “Threshold Independent Evaluation of Sound Event Detection Scores,” 2022."},"date_updated":"2023-11-22T08:26:58Z","oa":"1","author":[{"first_name":"Janek","id":"34851","full_name":"Ebbers, Janek","last_name":"Ebbers"},{"last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold"},{"last_name":"Serizel","full_name":"Serizel, Romain","first_name":"Romain"}],"date_created":"2022-11-14T12:17:03Z","title":"Threshold Independent Evaluation of Sound Event Detection Scores"},{"abstract":[{"text":"In this report we present our system for the Detection and Classification of Acoustic Scenes and Events (DCASE) 2022 Challenge Task 4: Sound Event Detection in Domestic Environments 1 . 
As in previous editions of the Challenge, we use forward-backward convolutional recurrent neural networks (FBCRNNs) [1, 2] for weakly labeled and semi-supervised sound event detection (SED) and eventually generate strong pseudo labels for weakly labeled and unlabeled data. Then, (tag-conditioned) bidirectional CRNNs (Bi-CRNNs) [1, 2] are trained in a strongly supervised manner as our final SED models. In each of the training stages we use multiple iterations of self-training. Compared to previous editions, we improved our system performance by 1) some tweaks regarding data augmentation, pseudo labeling and inference 2) using weakly labeled AudioSet data [3] for pretraining larger networks and 3) augmenting the DESED data [4] with strongly labeled AudioSet data [5] for finetuning of the networks. Source code is publicly available at https://github.com/fgnt/pb_sed.","lang":"eng"}],"file":[{"relation":"main_file","success":1,"content_type":"application/pdf","access_level":"closed","file_id":"49114","file_name":"dcase2022_tech_report_ebbers.pdf","file_size":491650,"creator":"ebbers","date_created":"2023-11-22T08:35:23Z","date_updated":"2023-11-22T08:35:23Z"}],"status":"public","type":"report","ddc":["000"],"file_date_updated":"2023-11-22T08:35:23Z","language":[{"iso":"eng"}],"project":[{"name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"_id":"49113","user_id":"34851","department":[{"_id":"54"}],"year":"2022","citation":{"ama":"Ebbers J, Haeb-Umbach R. <i>Pre-Training And Self-Training For Sound Event Detection In Domestic Environments</i>.; 2022.","chicago":"Ebbers, Janek, and Reinhold Haeb-Umbach. <i>Pre-Training And Self-Training For Sound Event Detection In Domestic Environments</i>, 2022.","ieee":"J. Ebbers and R. Haeb-Umbach, <i>Pre-Training And Self-Training For Sound Event Detection In Domestic Environments</i>. 2022.","apa":"Ebbers, J., &#38; Haeb-Umbach, R. (2022). 
<i>Pre-Training And Self-Training For Sound Event Detection In Domestic Environments</i>.","mla":"Ebbers, Janek, and Reinhold Haeb-Umbach. <i>Pre-Training And Self-Training For Sound Event Detection In Domestic Environments</i>. 2022.","bibtex":"@book{Ebbers_Haeb-Umbach_2022, title={Pre-Training And Self-Training For Sound Event Detection In Domestic Environments}, author={Ebbers, Janek and Haeb-Umbach, Reinhold}, year={2022} }","short":"J. Ebbers, R. Haeb-Umbach, Pre-Training And Self-Training For Sound Event Detection In Domestic Environments, 2022."},"has_accepted_license":"1","title":"Pre-Training And Self-Training For Sound Event Detection In Domestic Environments","date_updated":"2024-11-15T20:34:52Z","author":[{"first_name":"Janek","id":"34851","full_name":"Ebbers, Janek","last_name":"Ebbers"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold"}],"date_created":"2023-11-22T08:34:23Z"},{"language":[{"iso":"eng"}],"ddc":["000"],"file":[{"relation":"main_file","content_type":"application/pdf","file_size":236628,"file_id":"29305","file_name":"Template.pdf","access_level":"open_access","date_updated":"2022-01-13T08:19:19Z","creator":"ebbers","date_created":"2022-01-13T07:56:30Z"}],"abstract":[{"lang":"eng","text":"In this work we address disentanglement of style and content in speech signals. We propose a fully convolutional variational autoencoder employing two encoders: a content encoder and a style encoder. To foster disentanglement, we propose adversarial contrastive predictive coding. This new disentanglement method does neither need parallel data nor any supervision. We show that the proposed technique is capable of separating speaker and content traits into the two different representations and show competitive speaker-content disentanglement performance compared to other unsupervised approaches. 
We further demonstrate an increased robustness of the content representation against a train-test mismatch compared to spectral features, when used for phone recognition."}],"publication":"Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","title":"Contrastive Predictive Coding Supported Factorized Variational Autoencoder for Unsupervised Learning of Disentangled Speech Representations","date_created":"2022-01-13T07:55:29Z","year":"2021","quality_controlled":"1","file_date_updated":"2022-01-13T08:19:19Z","user_id":"34851","department":[{"_id":"54"}],"project":[{"_id":"52","name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"_id":"29304","status":"public","type":"conference","author":[{"first_name":"Janek","last_name":"Ebbers","id":"34851","full_name":"Ebbers, Janek"},{"full_name":"Kuhlmann, Michael","id":"49871","last_name":"Kuhlmann","first_name":"Michael"},{"last_name":"Cord-Landwehr","full_name":"Cord-Landwehr, Tobias","id":"44393","first_name":"Tobias"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242"}],"date_updated":"2023-11-22T08:29:42Z","oa":"1","citation":{"ieee":"J. Ebbers, M. Kuhlmann, T. Cord-Landwehr, and R. Haeb-Umbach, “Contrastive Predictive Coding Supported Factorized Variational Autoencoder for Unsupervised Learning of Disentangled Speech Representations,” in <i>Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 2021, pp. 3860–3864.","chicago":"Ebbers, Janek, Michael Kuhlmann, Tobias Cord-Landwehr, and Reinhold Haeb-Umbach. “Contrastive Predictive Coding Supported Factorized Variational Autoencoder for Unsupervised Learning of Disentangled Speech Representations.” In <i>Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 3860–3864, 2021.","ama":"Ebbers J, Kuhlmann M, Cord-Landwehr T, Haeb-Umbach R. 
Contrastive Predictive Coding Supported Factorized Variational Autoencoder for Unsupervised Learning of Disentangled Speech Representations. In: <i>Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>. ; 2021:3860–3864.","apa":"Ebbers, J., Kuhlmann, M., Cord-Landwehr, T., &#38; Haeb-Umbach, R. (2021). Contrastive Predictive Coding Supported Factorized Variational Autoencoder for Unsupervised Learning of Disentangled Speech Representations. <i>Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 3860–3864.","bibtex":"@inproceedings{Ebbers_Kuhlmann_Cord-Landwehr_Haeb-Umbach_2021, title={Contrastive Predictive Coding Supported Factorized Variational Autoencoder for Unsupervised Learning of Disentangled Speech Representations}, booktitle={Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, author={Ebbers, Janek and Kuhlmann, Michael and Cord-Landwehr, Tobias and Haeb-Umbach, Reinhold}, year={2021}, pages={3860–3864} }","short":"J. Ebbers, M. Kuhlmann, T. Cord-Landwehr, R. Haeb-Umbach, in: Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2021, pp. 3860–3864.","mla":"Ebbers, Janek, et al. “Contrastive Predictive Coding Supported Factorized Variational Autoencoder for Unsupervised Learning of Disentangled Speech Representations.” <i>Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 2021, pp. 
3860–3864."},"page":"3860–3864","has_accepted_license":"1"},{"title":"Self-Trained Audio Tagging and Sound Event Detection in Domestic Environments","date_created":"2022-01-13T08:07:47Z","year":"2021","quality_controlled":"1","language":[{"iso":"eng"}],"ddc":["000"],"file":[{"date_updated":"2022-01-13T08:19:50Z","creator":"ebbers","date_created":"2022-01-13T08:08:54Z","file_size":239462,"file_id":"29309","access_level":"open_access","file_name":"template.pdf","content_type":"application/pdf","relation":"main_file"}],"abstract":[{"lang":"eng","text":"In this paper we present our system for the Detection and Classification of Acoustic Scenes and Events (DCASE) 2021 Challenge Task 4: Sound Event Detection and Separation in Domestic Environments, where it scored the fourth rank. Our presented solution is an advancement of our system used in the previous edition of the task.We use a forward-backward convolutional recurrent neural network (FBCRNN) for tagging and pseudo labeling followed by tag-conditioned sound event detection (SED) models which are trained using strong pseudo labels provided by the FBCRNN. Our advancement over our earlier model is threefold. First, we introduce a strong label loss in the objective of the FBCRNN to take advantage of the strongly labeled synthetic data during training. Second, we perform multiple iterations of self-training for both the FBCRNN and tag-conditioned SED models. Third, while we used only tag-conditioned CNNs as our SED model in the previous edition we here explore sophisticated tag-conditioned SED model architectures, namely, bidirectional CRNNs and bidirectional convolutional transformer neural networks (CTNNs), and combine them. 
With metric and class specific tuning of median filter lengths for post-processing, our final SED model, consisting of 6 submodels (2 of each architecture), achieves on the public evaluation set poly-phonic sound event detection scores (PSDS) of 0.455 for scenario 1 and 0.684 for scenario as well as a collar-based F1-score of 0.596 outperforming the baselines and our model from the previous edition by far. Source code is publicly available at https://github.com/fgnt/pb_sed."}],"publication":"Proceedings of the 6th Detection and Classification of Acoustic Scenes and Events 2021 Workshop (DCASE2021)","author":[{"first_name":"Janek","id":"34851","full_name":"Ebbers, Janek","last_name":"Ebbers"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242"}],"date_updated":"2023-11-22T08:28:32Z","oa":"1","citation":{"apa":"Ebbers, J., &#38; Haeb-Umbach, R. (2021). Self-Trained Audio Tagging and Sound Event Detection in Domestic Environments. <i>Proceedings of the 6th Detection and Classification of Acoustic Scenes and Events 2021 Workshop (DCASE2021)</i>, 226–230.","short":"J. Ebbers, R. Haeb-Umbach, in: Proceedings of the 6th Detection and Classification of Acoustic Scenes and Events 2021 Workshop (DCASE2021), Barcelona, Spain, 2021, pp. 226–230.","bibtex":"@inproceedings{Ebbers_Haeb-Umbach_2021, place={Barcelona, Spain}, title={Self-Trained Audio Tagging and Sound Event Detection in Domestic Environments}, booktitle={Proceedings of the 6th Detection and Classification of Acoustic Scenes and Events 2021 Workshop (DCASE2021)}, author={Ebbers, Janek and Haeb-Umbach, Reinhold}, year={2021}, pages={226–230} }","mla":"Ebbers, Janek, and Reinhold Haeb-Umbach. “Self-Trained Audio Tagging and Sound Event Detection in Domestic Environments.” <i>Proceedings of the 6th Detection and Classification of Acoustic Scenes and Events 2021 Workshop (DCASE2021)</i>, 2021, pp. 226–230.","ama":"Ebbers J, Haeb-Umbach R. 
Self-Trained Audio Tagging and Sound Event Detection in Domestic Environments. In: <i>Proceedings of the 6th Detection and Classification of Acoustic Scenes and Events 2021 Workshop (DCASE2021)</i>. ; 2021:226–230.","chicago":"Ebbers, Janek, and Reinhold Haeb-Umbach. “Self-Trained Audio Tagging and Sound Event Detection in Domestic Environments.” In <i>Proceedings of the 6th Detection and Classification of Acoustic Scenes and Events 2021 Workshop (DCASE2021)</i>, 226–230. Barcelona, Spain, 2021.","ieee":"J. Ebbers and R. Haeb-Umbach, “Self-Trained Audio Tagging and Sound Event Detection in Domestic Environments,” in <i>Proceedings of the 6th Detection and Classification of Acoustic Scenes and Events 2021 Workshop (DCASE2021)</i>, 2021, pp. 226–230."},"page":"226–230","place":"Barcelona, Spain","publication_identifier":{"isbn":["978-84-09-36072-7"]},"has_accepted_license":"1","file_date_updated":"2022-01-13T08:19:50Z","user_id":"34851","department":[{"_id":"54"}],"project":[{"_id":"52","name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"_id":"29308","status":"public","type":"conference"},{"title":"Adapting Sound Recognition to A New Environment Via Self-Training","date_updated":"2023-11-22T08:28:50Z","oa":"1","date_created":"2022-01-13T08:01:21Z","author":[{"last_name":"Ebbers","id":"34851","full_name":"Ebbers, Janek","first_name":"Janek"},{"last_name":"Keyser","full_name":"Keyser, Moritz Curt","first_name":"Moritz Curt"},{"first_name":"Reinhold","full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach"}],"year":"2021","page":"1135–1139","citation":{"ama":"Ebbers J, Keyser MC, Haeb-Umbach R. Adapting Sound Recognition to A New Environment Via Self-Training. In: <i>Proceedings of the 29th European Signal Processing Conference (EUSIPCO)</i>. ; 2021:1135–1139.","ieee":"J. Ebbers, M. C. Keyser, and R. 
Haeb-Umbach, “Adapting Sound Recognition to A New Environment Via Self-Training,” in <i>Proceedings of the 29th European Signal Processing Conference (EUSIPCO)</i>, 2021, pp. 1135–1139.","chicago":"Ebbers, Janek, Moritz Curt Keyser, and Reinhold Haeb-Umbach. “Adapting Sound Recognition to A New Environment Via Self-Training.” In <i>Proceedings of the 29th European Signal Processing Conference (EUSIPCO)</i>, 1135–1139, 2021.","bibtex":"@inproceedings{Ebbers_Keyser_Haeb-Umbach_2021, title={Adapting Sound Recognition to A New Environment Via Self-Training}, booktitle={Proceedings of the 29th European Signal Processing Conference (EUSIPCO)}, author={Ebbers, Janek and Keyser, Moritz Curt and Haeb-Umbach, Reinhold}, year={2021}, pages={1135–1139} }","mla":"Ebbers, Janek, et al. “Adapting Sound Recognition to A New Environment Via Self-Training.” <i>Proceedings of the 29th European Signal Processing Conference (EUSIPCO)</i>, 2021, pp. 1135–1139.","short":"J. Ebbers, M.C. Keyser, R. Haeb-Umbach, in: Proceedings of the 29th European Signal Processing Conference (EUSIPCO), 2021, pp. 1135–1139.","apa":"Ebbers, J., Keyser, M. C., &#38; Haeb-Umbach, R. (2021). Adapting Sound Recognition to A New Environment Via Self-Training. <i>Proceedings of the 29th European Signal Processing Conference (EUSIPCO)</i>, 1135–1139."},"has_accepted_license":"1","quality_controlled":"1","ddc":["000"],"language":[{"iso":"eng"}],"file_date_updated":"2022-01-13T08:19:35Z","_id":"29306","project":[{"_id":"52","name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"department":[{"_id":"54"}],"user_id":"34851","abstract":[{"lang":"eng","text":"Recently, there has been a rising interest in sound recognition via Acoustic Sensor Networks to support applications such as ambient assisted living or environmental habitat monitoring. With state-of-the-art sound recognition being dominated by deep-learning-based approaches, there is a high demand for labeled training data. 
Despite the availability of large-scale  data sets such as Google's AudioSet, acquiring training data matching a certain application environment is still often a problem. In this paper we are concerned with human activity monitoring in a domestic environment using an ASN consisting of multiple nodes each providing multichannel signals. We propose a self-training based domain adaptation approach, which only requires unlabeled data from the target environment. Here, a sound recognition system trained on AudioSet, the teacher, generates pseudo labels for data from the target environment on which a student network is trained. The student can furthermore glean information about the spatial arrangement of sensors and sound sources to further improve classification performance. It is shown that  the student significantly improves recognition performance over the pre-trained teacher without relying on labeled data from the environment the system is deployed in."}],"status":"public","file":[{"relation":"main_file","content_type":"application/pdf","file_id":"29307","file_name":"conference_101719.pdf","access_level":"open_access","file_size":213938,"creator":"ebbers","date_created":"2022-01-13T08:03:26Z","date_updated":"2022-01-13T08:19:35Z"}],"publication":"Proceedings of the 29th European Signal Processing Conference (EUSIPCO)","type":"conference"},{"year":"2020","quality_controlled":"1","title":"Forward-Backward Convolutional Recurrent Neural Networks and Tag-Conditioned Convolutional Neural Networks for Weakly Labeled Semi-Supervised Sound Event Detection","date_created":"2020-12-16T08:55:27Z","file":[{"date_updated":"2020-12-16T08:57:22Z","date_created":"2020-12-16T08:57:22Z","creator":"huesera","file_size":108326,"file_name":"DCASE2020Workshop_Ebbers_Paper.pdf","access_level":"open_access","file_id":"20754","content_type":"application/pdf","relation":"main_file"}],"abstract":[{"lang":"eng","text":"In this paper we present our system for the detection and classification 
of acoustic scenes and events (DCASE) 2020 Challenge Task 4: Sound event detection and separation in domestic environments. We introduce two new models: the forward-backward convolutional recurrent neural network (FBCRNN) and the tag-conditioned convolutional neural network (CNN). The FBCRNN employs two recurrent neural network (RNN) classifiers sharing the same CNN for preprocessing. With one RNN processing a recording in forward direction and the other in backward direction, the two networks are trained to jointly predict audio tags, i.e., weak labels, at each time step within a recording, given that at each time step they have jointly processed the whole recording. The proposed training encourages the classifiers to tag events as soon as possible. Therefore, after training, the networks can be applied to shorter audio segments of, e.g., 200ms, allowing sound event detection (SED). Further, we propose a tag-conditioned CNN to complement SED. It is trained to predict strong labels while using (predicted) tags, i.e., weak labels, as additional input. For training pseudo strong labels from a FBCRNN ensemble are used. The presented system scored the fourth and third place in the systems and teams rankings, respectively. Subsequent improvements allow our system to even outperform the challenge baseline and winner systems in average by, respectively, 18.0% and 2.2% event-based F1-score on the validation set. 
Source code is publicly available at https://github.com/fgnt/pb_sed."}],"publication":"Proceedings of the Detection and Classification of Acoustic Scenes and Events 2020 Workshop (DCASE2020)","language":[{"iso":"eng"}],"ddc":["000"],"citation":{"bibtex":"@inproceedings{Ebbers_Haeb-Umbach_2020, title={Forward-Backward Convolutional Recurrent Neural Networks and Tag-Conditioned Convolutional Neural Networks for Weakly Labeled Semi-Supervised Sound Event Detection}, booktitle={Proceedings of the Detection and Classification of Acoustic Scenes and Events 2020 Workshop (DCASE2020)}, author={Ebbers, Janek and Haeb-Umbach, Reinhold}, year={2020} }","short":"J. Ebbers, R. Haeb-Umbach, in: Proceedings of the Detection and Classification of Acoustic Scenes and Events 2020 Workshop (DCASE2020), 2020.","mla":"Ebbers, Janek, and Reinhold Haeb-Umbach. “Forward-Backward Convolutional Recurrent Neural Networks and Tag-Conditioned Convolutional Neural Networks for Weakly Labeled Semi-Supervised Sound Event Detection.” <i>Proceedings of the Detection and Classification of Acoustic Scenes and Events 2020 Workshop (DCASE2020)</i>, 2020.","apa":"Ebbers, J., &#38; Haeb-Umbach, R. (2020). Forward-Backward Convolutional Recurrent Neural Networks and Tag-Conditioned Convolutional Neural Networks for Weakly Labeled Semi-Supervised Sound Event Detection. <i>Proceedings of the Detection and Classification of Acoustic Scenes and Events 2020 Workshop (DCASE2020)</i>.","ieee":"J. Ebbers and R. Haeb-Umbach, “Forward-Backward Convolutional Recurrent Neural Networks and Tag-Conditioned Convolutional Neural Networks for Weakly Labeled Semi-Supervised Sound Event Detection,” 2020.","chicago":"Ebbers, Janek, and Reinhold Haeb-Umbach. 
“Forward-Backward Convolutional Recurrent Neural Networks and Tag-Conditioned Convolutional Neural Networks for Weakly Labeled Semi-Supervised Sound Event Detection.” In <i>Proceedings of the Detection and Classification of Acoustic Scenes and Events 2020 Workshop (DCASE2020)</i>, 2020.","ama":"Ebbers J, Haeb-Umbach R. Forward-Backward Convolutional Recurrent Neural Networks and Tag-Conditioned Convolutional Neural Networks for Weakly Labeled Semi-Supervised Sound Event Detection. In: <i>Proceedings of the Detection and Classification of Acoustic Scenes and Events 2020 Workshop (DCASE2020)</i>. ; 2020."},"has_accepted_license":"1","author":[{"first_name":"Janek","last_name":"Ebbers","full_name":"Ebbers, Janek","id":"34851"},{"first_name":"Reinhold","full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach"}],"oa":"1","date_updated":"2023-11-22T08:27:32Z","status":"public","type":"conference","file_date_updated":"2020-12-16T08:57:22Z","department":[{"_id":"54"}],"user_id":"34851","_id":"20753","project":[{"_id":"52","name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing"}]},{"type":"conference","publication":"Proc. 10th ISCA Speech Synthesis Workshop","status":"public","abstract":[{"text":"This  paper  presents  an  approach  to  voice  conversion,  whichdoes neither require parallel data nor speaker or phone labels fortraining.  It can convert between speakers which are not in thetraining set by employing the previously proposed concept of afactorized hierarchical variational autoencoder. Here, linguisticand speaker induced variations are separated upon the notionthat content induced variations change at a much shorter timescale, i.e., at the segment level, than speaker induced variations,which vary at the longer utterance level. 
In this contribution we propose to employ convolutional instead of recurrent network layers in the encoder and decoder blocks, which is shown to achieve better phone recognition accuracy on the latent segment variables at frame-level due to their better temporal resolution. For voice conversion the mean of the utterance variables is replaced with the respective estimated mean of the target speaker. The resulting log-mel spectra of the decoder output are used as local conditions of a WaveNet which is utilized for synthesis of the speech waveforms. Experiments show both good disentanglement properties of the latent space variables, and good voice conversion performance.","lang":"eng"}],"user_id":"44006","department":[{"_id":"54"}],"_id":"15237","language":[{"iso":"eng"}],"related_material":{"link":[{"description":"Listening examples","relation":"supplementary_material","url":"http://go.upb.de/vcex"}]},"quality_controlled":"1","citation":{"chicago":"Gburrek, Tobias, Thomas Glarner, Janek Ebbers, Reinhold Haeb-Umbach, and Petra Wagner. “Unsupervised Learning of a Disentangled Speech Representation for Voice Conversion.” In <i>Proc. 10th ISCA Speech Synthesis Workshop</i>, 81–86, 2019. <a href=\"https://doi.org/10.21437/SSW.2019-15\">https://doi.org/10.21437/SSW.2019-15</a>.","ieee":"T. Gburrek, T. Glarner, J. Ebbers, R. Haeb-Umbach, and P. Wagner, “Unsupervised Learning of a Disentangled Speech Representation for Voice Conversion,” in <i>Proc. 10th ISCA Speech Synthesis Workshop</i>, Vienna, 2019, pp. 81–86, doi: <a href=\"https://doi.org/10.21437/SSW.2019-15\">10.21437/SSW.2019-15</a>.","ama":"Gburrek T, Glarner T, Ebbers J, Haeb-Umbach R, Wagner P. Unsupervised Learning of a Disentangled Speech Representation for Voice Conversion. In: <i>Proc. 10th ISCA Speech Synthesis Workshop</i>. ; 2019:81-86. doi:<a href=\"https://doi.org/10.21437/SSW.2019-15\">10.21437/SSW.2019-15</a>","apa":"Gburrek, T., Glarner, T., Ebbers, J., Haeb-Umbach, R., &#38; Wagner, P. (2019). 
Unsupervised Learning of a Disentangled Speech Representation for Voice Conversion. <i>Proc. 10th ISCA Speech Synthesis Workshop</i>, 81–86. <a href=\"https://doi.org/10.21437/SSW.2019-15\">https://doi.org/10.21437/SSW.2019-15</a>","mla":"Gburrek, Tobias, et al. “Unsupervised Learning of a Disentangled Speech Representation for Voice Conversion.” <i>Proc. 10th ISCA Speech Synthesis Workshop</i>, 2019, pp. 81–86, doi:<a href=\"https://doi.org/10.21437/SSW.2019-15\">10.21437/SSW.2019-15</a>.","short":"T. Gburrek, T. Glarner, J. Ebbers, R. Haeb-Umbach, P. Wagner, in: Proc. 10th ISCA Speech Synthesis Workshop, 2019, pp. 81–86.","bibtex":"@inproceedings{Gburrek_Glarner_Ebbers_Haeb-Umbach_Wagner_2019, title={Unsupervised Learning of a Disentangled Speech Representation for Voice Conversion}, DOI={<a href=\"https://doi.org/10.21437/SSW.2019-15\">10.21437/SSW.2019-15</a>}, booktitle={Proc. 10th ISCA Speech Synthesis Workshop}, author={Gburrek, Tobias and Glarner, Thomas and Ebbers, Janek and Haeb-Umbach, Reinhold and Wagner, Petra}, year={2019}, pages={81–86} }"},"page":"81-86","year":"2019","date_created":"2019-12-04T08:12:29Z","author":[{"last_name":"Gburrek","id":"44006","full_name":"Gburrek, Tobias","first_name":"Tobias"},{"first_name":"Thomas","id":"14169","full_name":"Glarner, Thomas","last_name":"Glarner"},{"first_name":"Janek","last_name":"Ebbers","full_name":"Ebbers, Janek","id":"34851"},{"first_name":"Reinhold","full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach"},{"first_name":"Petra","last_name":"Wagner","full_name":"Wagner, Petra"}],"date_updated":"2023-11-17T06:20:39Z","oa":"1","main_file_link":[{"open_access":"1","url":"https://www.isca-speech.org/archive/pdfs/ssw_2019/gburrek19_ssw.pdf"}],"doi":"10.21437/SSW.2019-15","conference":{"location":"Vienna","name":"10th ISCA Speech Synthesis Workshop"},"title":"Unsupervised Learning of a Disentangled Speech Representation for Voice 
Conversion"},{"ddc":["000"],"language":[{"iso":"eng"}],"file_date_updated":"2020-02-05T10:18:06Z","project":[{"_id":"52","name":"Computing Resources Provided by the Paderborn Center for Parallel Computing"}],"_id":"15794","user_id":"34851","department":[{"_id":"54"}],"abstract":[{"lang":"eng","text":"In this paper we present our audio tagging system for the DCASE 2019 Challenge Task 2. We propose a model consisting of a convolutional front end using log-mel-energies as input features, a recurrent neural network sequence encoder and a fully connected classifier network outputting an activity probability for each of the 80 considered event classes. Due to the recurrent neural network, which encodes a whole sequence into a single vector, our model is able to process sequences of varying lengths. The model is trained with only little manually labeled training data and a larger amount of automatically labeled web data, which hence suffers from label noise. To efficiently train the model with the provided data we use various data augmentation to prevent overfitting and improve generalization. Our best submitted system achieves a label-weighted label-ranking average precision (lwlrap) of 75.5% on the private test set which is an absolute improvement of 21.7% over the baseline. This system scored the second place in the teams ranking of the DCASE 2019 Challenge Task 2 and the fifth place in the Kaggle competition “Freesound Audio Tagging 2019” with more than 400 participants. 
After the challenge ended we further improved performance to 76.5% lwlrap setting a new state-of-the-art on this dataset."}],"file":[{"content_type":"application/pdf","relation":"main_file","date_updated":"2020-02-05T10:18:06Z","date_created":"2020-02-05T10:18:06Z","creator":"huesera","file_size":184967,"file_id":"15795","access_level":"open_access","file_name":"DCASE_2019_WS_Ebbers_Paper.pdf"}],"status":"public","type":"conference","publication":"DCASE2019 Workshop, New York, USA","title":"Convolutional Recurrent Neural Network and Data Augmentation for Audio Tagging with Noisy Labels and Minimal Supervision","oa":"1","date_updated":"2023-11-22T08:30:12Z","date_created":"2020-02-05T10:16:03Z","author":[{"id":"34851","full_name":"Ebbers, Janek","last_name":"Ebbers","first_name":"Janek"},{"last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold"}],"year":"2019","citation":{"chicago":"Ebbers, Janek, and Reinhold Haeb-Umbach. “Convolutional Recurrent Neural Network and Data Augmentation for Audio Tagging with Noisy Labels and Minimal Supervision.” In <i>DCASE2019 Workshop, New York, USA</i>, 2019.","ieee":"J. Ebbers and R. Haeb-Umbach, “Convolutional Recurrent Neural Network and Data Augmentation for Audio Tagging with Noisy Labels and Minimal Supervision,” 2019.","ama":"Ebbers J, Haeb-Umbach R. Convolutional Recurrent Neural Network and Data Augmentation for Audio Tagging with Noisy Labels and Minimal Supervision. In: <i>DCASE2019 Workshop, New York, USA</i>. ; 2019.","apa":"Ebbers, J., &#38; Haeb-Umbach, R. (2019). Convolutional Recurrent Neural Network and Data Augmentation for Audio Tagging with Noisy Labels and Minimal Supervision. <i>DCASE2019 Workshop, New York, USA</i>.","short":"J. Ebbers, R. Haeb-Umbach, in: DCASE2019 Workshop, New York, USA, 2019.","mla":"Ebbers, Janek, and Reinhold Haeb-Umbach. 
“Convolutional Recurrent Neural Network and Data Augmentation for Audio Tagging with Noisy Labels and Minimal Supervision.” <i>DCASE2019 Workshop, New York, USA</i>, 2019.","bibtex":"@inproceedings{Ebbers_Haeb-Umbach_2019, title={Convolutional Recurrent Neural Network and Data Augmentation for Audio Tagging with Noisy Labels and Minimal Supervision}, booktitle={DCASE2019 Workshop, New York, USA}, author={Ebbers, Janek and Haeb-Umbach, Reinhold}, year={2019} }"},"quality_controlled":"1","has_accepted_license":"1"},{"file":[{"content_type":"application/pdf","relation":"main_file","date_updated":"2020-02-05T10:21:39Z","date_created":"2020-02-05T10:21:39Z","creator":"huesera","file_size":311887,"access_level":"open_access","file_id":"15797","file_name":"CAMSAP_2019_WS_Ebbers_Paper.pdf"}],"status":"public","abstract":[{"lang":"eng","text":"In this paper we consider human daily activity recognition using an acoustic sensor network (ASN) which consists of nodes distributed in a home environment. Assuming that the ASN is permanently recording, the vast majority of recordings is silence. Therefore, we propose to employ a computationally efficient two-stage sound recognition system, consisting of an initial sound activity detection (SAD) and a subsequent sound event classification (SEC), which is only activated once sound activity has been detected. We show how a low-latency activity detector with high temporal resolution can be trained from weak labels with low temporal resolution. 
We further demonstrate the advantage of using spatial features for the subsequent event classification task."}],"type":"conference","publication":"CAMSAP 2019, Guadeloupe, West Indies","language":[{"iso":"eng"}],"file_date_updated":"2020-02-05T10:21:39Z","ddc":["000"],"user_id":"34851","department":[{"_id":"54"}],"project":[{"name":"Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"}],"_id":"15796","citation":{"apa":"Ebbers, J., Drude, L., Haeb-Umbach, R., Brendel, A., &#38; Kellermann, W. (2019). Weakly Supervised Sound Activity Detection and Event Classification in Acoustic Sensor Networks. <i>CAMSAP 2019, Guadeloupe, West Indies</i>.","mla":"Ebbers, Janek, et al. “Weakly Supervised Sound Activity Detection and Event Classification in Acoustic Sensor Networks.” <i>CAMSAP 2019, Guadeloupe, West Indies</i>, 2019.","short":"J. Ebbers, L. Drude, R. Haeb-Umbach, A. Brendel, W. Kellermann, in: CAMSAP 2019, Guadeloupe, West Indies, 2019.","bibtex":"@inproceedings{Ebbers_Drude_Haeb-Umbach_Brendel_Kellermann_2019, title={Weakly Supervised Sound Activity Detection and Event Classification in Acoustic Sensor Networks}, booktitle={CAMSAP 2019, Guadeloupe, West Indies}, author={Ebbers, Janek and Drude, Lukas and Haeb-Umbach, Reinhold and Brendel, Andreas and Kellermann, Walter}, year={2019} }","ama":"Ebbers J, Drude L, Haeb-Umbach R, Brendel A, Kellermann W. Weakly Supervised Sound Activity Detection and Event Classification in Acoustic Sensor Networks. In: <i>CAMSAP 2019, Guadeloupe, West Indies</i>. ; 2019.","ieee":"J. Ebbers, L. Drude, R. Haeb-Umbach, A. Brendel, and W. Kellermann, “Weakly Supervised Sound Activity Detection and Event Classification in Acoustic Sensor Networks,” 2019.","chicago":"Ebbers, Janek, Lukas Drude, Reinhold Haeb-Umbach, Andreas Brendel, and Walter Kellermann. 
“Weakly Supervised Sound Activity Detection and Event Classification in Acoustic Sensor Networks.” In <i>CAMSAP 2019, Guadeloupe, West Indies</i>, 2019."},"year":"2019","has_accepted_license":"1","quality_controlled":"1","title":"Weakly Supervised Sound Activity Detection and Event Classification in Acoustic Sensor Networks","author":[{"first_name":"Janek","last_name":"Ebbers","full_name":"Ebbers, Janek","id":"34851"},{"last_name":"Drude","full_name":"Drude, Lukas","id":"11213","first_name":"Lukas"},{"first_name":"Reinhold","last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242"},{"first_name":"Andreas","last_name":"Brendel","full_name":"Brendel, Andreas"},{"full_name":"Kellermann, Walter","last_name":"Kellermann","first_name":"Walter"}],"date_created":"2020-02-05T10:20:17Z","date_updated":"2023-11-22T08:29:58Z","oa":"1"},{"status":"public","file":[{"relation":"main_file","content_type":"application/pdf","file_size":454600,"file_name":"INTERSPEECH_2019_Ebbers_Paper.pdf","access_level":"open_access","file_id":"15793","date_updated":"2020-02-05T10:11:40Z","creator":"huesera","date_created":"2020-02-05T10:11:40Z"}],"abstract":[{"text":"In this paper we highlight the privacy risks entailed in deep neural network feature extraction for domestic activity monitoring. We employ the baseline system proposed in the Task 5 of the DCASE 2018 challenge and simulate a feature interception attack by an eavesdropper who wants to perform speaker identification. We then propose to reduce the aforementioned privacy risks by introducing a variational information feature extraction scheme that allows for good activity monitoring performance while at the same time minimizing the information of the feature representation, thus restricting speaker identification attempts. We analyze the resulting model’s composite loss function and the budget scaling factor used to control the balance between the performance of the trusted and attacker tasks. 
It is empirically demonstrated that the proposed method reduces speaker identification privacy risks without significantly deprecating the performance of domestic activity monitoring tasks.","lang":"eng"}],"publication":"INTERSPEECH 2019, Graz, Austria","type":"conference","language":[{"iso":"eng"}],"file_date_updated":"2020-02-05T10:11:40Z","ddc":["000"],"department":[{"_id":"54"}],"user_id":"34851","_id":"15792","citation":{"apa":"Nelus, A., Ebbers, J., Haeb-Umbach, R., &#38; Martin, R. (2019). Privacy-preserving Variational Information Feature Extraction for Domestic Activity Monitoring Versus Speaker Identification. <i>INTERSPEECH 2019, Graz, Austria</i>.","bibtex":"@inproceedings{Nelus_Ebbers_Haeb-Umbach_Martin_2019, title={Privacy-preserving Variational Information Feature Extraction for Domestic Activity Monitoring Versus Speaker Identification}, booktitle={INTERSPEECH 2019, Graz, Austria}, author={Nelus, Alexandru and Ebbers, Janek and Haeb-Umbach, Reinhold and Martin, Rainer}, year={2019} }","mla":"Nelus, Alexandru, et al. “Privacy-Preserving Variational Information Feature Extraction for Domestic Activity Monitoring Versus Speaker Identification.” <i>INTERSPEECH 2019, Graz, Austria</i>, 2019.","short":"A. Nelus, J. Ebbers, R. Haeb-Umbach, R. Martin, in: INTERSPEECH 2019, Graz, Austria, 2019.","ama":"Nelus A, Ebbers J, Haeb-Umbach R, Martin R. Privacy-preserving Variational Information Feature Extraction for Domestic Activity Monitoring Versus Speaker Identification. In: <i>INTERSPEECH 2019, Graz, Austria</i>. ; 2019.","ieee":"A. Nelus, J. Ebbers, R. Haeb-Umbach, and R. Martin, “Privacy-preserving Variational Information Feature Extraction for Domestic Activity Monitoring Versus Speaker Identification,” 2019.","chicago":"Nelus, Alexandru, Janek Ebbers, Reinhold Haeb-Umbach, and Rainer Martin. 
“Privacy-Preserving Variational Information Feature Extraction for Domestic Activity Monitoring Versus Speaker Identification.” In <i>INTERSPEECH 2019, Graz, Austria</i>, 2019."},"year":"2019","has_accepted_license":"1","quality_controlled":"1","title":"Privacy-preserving Variational Information Feature Extraction for Domestic Activity Monitoring Versus Speaker Identification","date_created":"2020-02-05T10:07:53Z","author":[{"last_name":"Nelus","full_name":"Nelus, Alexandru","first_name":"Alexandru"},{"first_name":"Janek","last_name":"Ebbers","full_name":"Ebbers, Janek","id":"34851"},{"last_name":"Haeb-Umbach","id":"242","full_name":"Haeb-Umbach, Reinhold","first_name":"Reinhold"},{"full_name":"Martin, Rainer","last_name":"Martin","first_name":"Rainer"}],"date_updated":"2023-11-22T08:27:55Z","oa":"1"},{"year":"2018","citation":{"ama":"Ebbers J, Nelus A, Martin R, Haeb-Umbach R. Evaluation of Modulation-MFCC Features and DNN Classification for Acoustic Event Detection. In: <i>DAGA 2018, München</i>. ; 2018.","chicago":"Ebbers, Janek, Alexandru Nelus, Rainer Martin, and Reinhold Haeb-Umbach. “Evaluation of Modulation-MFCC Features and DNN Classification for Acoustic Event Detection.” In <i>DAGA 2018, München</i>, 2018.","ieee":"J. Ebbers, A. Nelus, R. Martin, and R. Haeb-Umbach, “Evaluation of Modulation-MFCC Features and DNN Classification for Acoustic Event Detection,” in <i>DAGA 2018, München</i>, 2018.","apa":"Ebbers, J., Nelus, A., Martin, R., &#38; Haeb-Umbach, R. (2018). Evaluation of Modulation-MFCC Features and DNN Classification for Acoustic Event Detection. In <i>DAGA 2018, München</i>.","bibtex":"@inproceedings{Ebbers_Nelus_Martin_Haeb-Umbach_2018, title={Evaluation of Modulation-MFCC Features and DNN Classification for Acoustic Event Detection}, booktitle={DAGA 2018, München}, author={Ebbers, Janek and Nelus, Alexandru and Martin, Rainer and Haeb-Umbach, Reinhold}, year={2018} }","mla":"Ebbers, Janek, et al. 
“Evaluation of Modulation-MFCC Features and DNN Classification for Acoustic Event Detection.” <i>DAGA 2018, München</i>, 2018.","short":"J. Ebbers, A. Nelus, R. Martin, R. Haeb-Umbach, in: DAGA 2018, München, 2018."},"title":"Evaluation of Modulation-MFCC Features and DNN Classification for Acoustic Event Detection","main_file_link":[{"open_access":"1","url":"https://groups.uni-paderborn.de/nt/pubs/2018/Daga_2018_Ebbers_Paper.pdf"}],"oa":"1","date_updated":"2022-01-06T06:51:08Z","date_created":"2019-07-12T05:27:43Z","author":[{"first_name":"Janek","last_name":"Ebbers","full_name":"Ebbers, Janek","id":"34851"},{"last_name":"Nelus","full_name":"Nelus, Alexandru","first_name":"Alexandru"},{"last_name":"Martin","full_name":"Martin, Rainer","first_name":"Rainer"},{"id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach","first_name":"Reinhold"}],"abstract":[{"lang":"eng","text":"Acoustic event detection, i.e., the task of assigning a human interpretable label to a segment of audio, has only recently attracted increased interest in the research community. Driven by the DCASE challenges and the availability of large-scale audio datasets, the state-of-the-art has progressed rapidly with deep-learning-based classi- fiers dominating the field. Because several potential use cases favor a realization on distributed sensor nodes, e.g. ambient assisted living applications, habitat monitoring or surveillance, we are concerned with two issues here. Firstly the classification performance of such systems and secondly the computing resources required to achieve a certain performance considering node level feature extraction. 
In this contribution we look at the balance between the two criteria by employing traditional techniques and different deep learning architectures, including convolutional and recurrent models in the context of real life everyday audio recordings in realistic, however challenging, multisource conditions."}],"status":"public","publication":"DAGA 2018, München","type":"conference","language":[{"iso":"eng"}],"_id":"11760","department":[{"_id":"54"}],"user_id":"44006"},{"language":[{"iso":"eng"}],"_id":"11907","department":[{"_id":"54"}],"user_id":"34851","abstract":[{"lang":"eng","text":"The invention of the Variational Autoencoder enables the application of Neural Networks to a wide range of tasks in unsupervised learning, including the field of Acoustic Unit Discovery (AUD). The recently proposed Hidden Markov Model Variational Autoencoder (HMMVAE) allows a joint training of a neural network based feature extractor and a structured prior for the latent space given by a Hidden Markov Model. It has been shown that the HMMVAE significantly outperforms pure GMM-HMM based systems on the AUD task. However, the HMMVAE cannot autonomously infer the number of acoustic units and thus relies on the GMM-HMM system for initialization. This paper introduces the Bayesian Hidden Markov Model Variational Autoencoder (BHMMVAE) which solves these issues by embedding the HMMVAE in a Bayesian framework with a Dirichlet Process Prior for the distribution of the acoustic units, and diagonal or full-covariance Gaussians as emission distributions. 
Experiments on TIMIT and Xitsonga show that the BHMMVAE is able to autonomously infer a reasonable number of acoustic units, can be initialized without supervision by a GMM-HMM system, achieves computationally efficient stochastic variational inference by using natural gradient descent, and, additionally, improves the AUD performance over the HMMVAE."}],"status":"public","publication":"INTERSPEECH 2018, Hyderabad, India","type":"conference","title":"Full Bayesian Hidden Markov Model Variational Autoencoder for Acoustic Unit Discovery","main_file_link":[{"open_access":"1","url":"https://groups.uni-paderborn.de/nt/pubs/2018/INTERSPEECH_2018_Glarner_Paper.pdf"}],"date_updated":"2023-11-22T08:29:22Z","oa":"1","author":[{"id":"14169","full_name":"Glarner, Thomas","last_name":"Glarner","first_name":"Thomas"},{"last_name":"Hanebrink","full_name":"Hanebrink, Patrick","first_name":"Patrick"},{"full_name":"Ebbers, Janek","id":"34851","last_name":"Ebbers","first_name":"Janek"},{"first_name":"Reinhold","id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach"}],"date_created":"2019-07-12T05:30:34Z","year":"2018","citation":{"ama":"Glarner T, Hanebrink P, Ebbers J, Haeb-Umbach R. Full Bayesian Hidden Markov Model Variational Autoencoder for Acoustic Unit Discovery. In: <i>INTERSPEECH 2018, Hyderabad, India</i>. ; 2018.","chicago":"Glarner, Thomas, Patrick Hanebrink, Janek Ebbers, and Reinhold Haeb-Umbach. “Full Bayesian Hidden Markov Model Variational Autoencoder for Acoustic Unit Discovery.” In <i>INTERSPEECH 2018, Hyderabad, India</i>, 2018.","ieee":"T. Glarner, P. Hanebrink, J. Ebbers, and R. 
Haeb-Umbach, “Full Bayesian Hidden Markov Model Variational Autoencoder for Acoustic Unit Discovery,” 2018.","bibtex":"@inproceedings{Glarner_Hanebrink_Ebbers_Haeb-Umbach_2018, title={Full Bayesian Hidden Markov Model Variational Autoencoder for Acoustic Unit Discovery}, booktitle={INTERSPEECH 2018, Hyderabad, India}, author={Glarner, Thomas and Hanebrink, Patrick and Ebbers, Janek and Haeb-Umbach, Reinhold}, year={2018} }","short":"T. Glarner, P. Hanebrink, J. Ebbers, R. Haeb-Umbach, in: INTERSPEECH 2018, Hyderabad, India, 2018.","mla":"Glarner, Thomas, et al. “Full Bayesian Hidden Markov Model Variational Autoencoder for Acoustic Unit Discovery.” <i>INTERSPEECH 2018, Hyderabad, India</i>, 2018.","apa":"Glarner, T., Hanebrink, P., Ebbers, J., &#38; Haeb-Umbach, R. (2018). Full Bayesian Hidden Markov Model Variational Autoencoder for Acoustic Unit Discovery. <i>INTERSPEECH 2018, Hyderabad, India</i>."},"quality_controlled":"1","related_material":{"link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2018/INTERSPEECH_2018_Glarner_Slides.pdf","description":"Slides","relation":"supplementary_material"}]}},{"language":[{"iso":"eng"}],"user_id":"460","department":[{"_id":"54"}],"_id":"11836","status":"public","abstract":[{"text":"Due to their distributed nature wireless acoustic sensor networks offer great potential for improved signal acquisition, processing and classification for applications such as monitoring and surveillance, home automation, or hands-free telecommunication. To reduce the communication demand with a central server and to raise the privacy level it is desirable to perform processing at node level. The limited processing and memory capabilities on a sensor node, however, stand in contrast to the compute and memory intensive deep learning algorithms used in modern speech and audio processing. In this work, we perform benchmarking of commonly used convolutional and recurrent neural network architectures on a Raspberry Pi based acoustic sensor node. 
We show that it is possible to run medium-sized neural network topologies used for speech enhancement and speech recognition in real time. For acoustic event recognition, where predictions in a lower temporal resolution are sufficient, it is even possible to run current state-of-the-art deep convolutional models with a real-time-factor of 0.11.","lang":"eng"}],"type":"conference","publication":"ITG 2018, Oldenburg, Germany","main_file_link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2018/ITG_2018_Ebbers_Paper.pdf","open_access":"1"}],"title":"Benchmarking Neural Network Architectures for Acoustic Sensor Networks","date_created":"2019-07-12T05:29:11Z","author":[{"last_name":"Ebbers","id":"34851","full_name":"Ebbers, Janek","first_name":"Janek"},{"first_name":"Jens","full_name":"Heitkaemper, Jens","id":"27643","last_name":"Heitkaemper"},{"last_name":"Schmalenstroeer","full_name":"Schmalenstroeer, Joerg","id":"460","first_name":"Joerg"},{"first_name":"Reinhold","full_name":"Haeb-Umbach, Reinhold","id":"242","last_name":"Haeb-Umbach"}],"date_updated":"2023-10-26T08:12:40Z","oa":"1","citation":{"apa":"Ebbers, J., Heitkaemper, J., Schmalenstroeer, J., &#38; Haeb-Umbach, R. (2018). Benchmarking Neural Network Architectures for Acoustic Sensor Networks. <i>ITG 2018, Oldenburg, Germany</i>.","short":"J. Ebbers, J. Heitkaemper, J. Schmalenstroeer, R. Haeb-Umbach, in: ITG 2018, Oldenburg, Germany, 2018.","mla":"Ebbers, Janek, et al. “Benchmarking Neural Network Architectures for Acoustic Sensor Networks.” <i>ITG 2018, Oldenburg, Germany</i>, 2018.","bibtex":"@inproceedings{Ebbers_Heitkaemper_Schmalenstroeer_Haeb-Umbach_2018, title={Benchmarking Neural Network Architectures for Acoustic Sensor Networks}, booktitle={ITG 2018, Oldenburg, Germany}, author={Ebbers, Janek and Heitkaemper, Jens and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold}, year={2018} }","ama":"Ebbers J, Heitkaemper J, Schmalenstroeer J, Haeb-Umbach R. 
Benchmarking Neural Network Architectures for Acoustic Sensor Networks. In: <i>ITG 2018, Oldenburg, Germany</i>. ; 2018.","ieee":"J. Ebbers, J. Heitkaemper, J. Schmalenstroeer, and R. Haeb-Umbach, “Benchmarking Neural Network Architectures for Acoustic Sensor Networks,” 2018.","chicago":"Ebbers, Janek, Jens Heitkaemper, Joerg Schmalenstroeer, and Reinhold Haeb-Umbach. “Benchmarking Neural Network Architectures for Acoustic Sensor Networks.” In <i>ITG 2018, Oldenburg, Germany</i>, 2018."},"year":"2018","related_material":{"link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2018/ITG_2018_Ebbers_Poster.pdf","relation":"supplementary_material","description":"Poster"}]},"quality_controlled":"1"},{"main_file_link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2017/INTERSPEECH_2017_Ebbers_paper.pdf","open_access":"1"}],"title":"Hidden Markov Model Variational Autoencoder for Acoustic Unit Discovery","date_created":"2019-07-12T05:27:42Z","author":[{"last_name":"Ebbers","full_name":"Ebbers, Janek","id":"34851","first_name":"Janek"},{"last_name":"Heymann","full_name":"Heymann, Jahn","id":"9168","first_name":"Jahn"},{"full_name":"Drude, Lukas","id":"11213","last_name":"Drude","first_name":"Lukas"},{"first_name":"Thomas","last_name":"Glarner","id":"14169","full_name":"Glarner, Thomas"},{"last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242","first_name":"Reinhold"},{"first_name":"Bhiksha","full_name":"Raj, Bhiksha","last_name":"Raj"}],"date_updated":"2023-11-22T08:29:06Z","oa":"1","citation":{"ama":"Ebbers J, Heymann J, Drude L, Glarner T, Haeb-Umbach R, Raj B. Hidden Markov Model Variational Autoencoder for Acoustic Unit Discovery. In: <i>INTERSPEECH 2017, Stockholm, Schweden</i>. ; 2017.","chicago":"Ebbers, Janek, Jahn Heymann, Lukas Drude, Thomas Glarner, Reinhold Haeb-Umbach, and Bhiksha Raj. “Hidden Markov Model Variational Autoencoder for Acoustic Unit Discovery.” In <i>INTERSPEECH 2017, Stockholm, Schweden</i>, 2017.","ieee":"J. Ebbers, J. 
Heymann, L. Drude, T. Glarner, R. Haeb-Umbach, and B. Raj, “Hidden Markov Model Variational Autoencoder for Acoustic Unit Discovery,” 2017.","apa":"Ebbers, J., Heymann, J., Drude, L., Glarner, T., Haeb-Umbach, R., &#38; Raj, B. (2017). Hidden Markov Model Variational Autoencoder for Acoustic Unit Discovery. <i>INTERSPEECH 2017, Stockholm, Schweden</i>.","bibtex":"@inproceedings{Ebbers_Heymann_Drude_Glarner_Haeb-Umbach_Raj_2017, title={Hidden Markov Model Variational Autoencoder for Acoustic Unit Discovery}, booktitle={INTERSPEECH 2017, Stockholm, Schweden}, author={Ebbers, Janek and Heymann, Jahn and Drude, Lukas and Glarner, Thomas and Haeb-Umbach, Reinhold and Raj, Bhiksha}, year={2017} }","short":"J. Ebbers, J. Heymann, L. Drude, T. Glarner, R. Haeb-Umbach, B. Raj, in: INTERSPEECH 2017, Stockholm, Schweden, 2017.","mla":"Ebbers, Janek, et al. “Hidden Markov Model Variational Autoencoder for Acoustic Unit Discovery.” <i>INTERSPEECH 2017, Stockholm, Schweden</i>, 2017."},"year":"2017","related_material":{"link":[{"description":"Poster","relation":"supplementary_material","url":"https://groups.uni-paderborn.de/nt/pubs/2017/INTERSPEECH_2017_Ebbers_poster.pdf"},{"description":"Slides","relation":"supplementary_material","url":"https://groups.uni-paderborn.de/nt/pubs/2017/INTERSPEECH_2017_Ebbers_slides.pdf"}]},"quality_controlled":"1","language":[{"iso":"eng"}],"user_id":"34851","department":[{"_id":"54"}],"_id":"11759","status":"public","abstract":[{"text":"Variational Autoencoders (VAEs) have been shown to provide efficient neural-network-based approximate Bayesian inference for observation models for which exact inference is intractable. Its extension, the so-called Structured VAE (SVAE) allows inference in the presence of both discrete and continuous latent variables. Inspired by this extension, we developed a VAE with Hidden Markov Models (HMMs) as latent models. We applied the resulting HMM-VAE to the task of acoustic unit discovery in a zero resource scenario. 
Starting from an initial model based on variational inference in an HMM with Gaussian Mixture Model (GMM) emission probabilities, the accuracy of the acoustic unit discovery could be significantly improved by the HMM-VAE. In doing so we were able to demonstrate for an unsupervised learning task what is well-known in the supervised learning case: Neural networks provide superior modeling power compared to GMMs.","lang":"eng"}],"type":"conference","publication":"INTERSPEECH 2017, Stockholm, Schweden"}]
