[{"publication":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","abstract":[{"lang":"eng","text":"We propose an approach for simultaneous diarization and separation of meeting data. It consists of a complex Angular Central Gaussian Mixture Model (cACGMM) for speech source separation, and a von-Mises-Fisher Mixture Model (VMFMM) for diarization in a joint statistical framework. Through the integration, both spatial and spectral information are exploited for diarization and separation. We also develop a method for counting the number of active speakers in a segment of a meeting to support block-wise processing. While the total number of speakers in a meeting may be known, it is usually not known on a per-segment level. With the proposed speaker counting, joint diarization and source separation can be done segment-by-segment, and the permutation problem across segments is solved, thus allowing for block-online processing in the future. Experimental results on the LibriCSS meeting corpus show that the integrated approach outperforms a cascaded approach of diarization and speech enhancement in terms of WER, both on a per-segment and on a per-meeting level."}],"file":[{"file_size":259907,"file_id":"60930","file_name":"main.pdf","access_level":"closed","date_updated":"2025-08-14T08:11:57Z","creator":"cord","date_created":"2025-08-14T08:11:57Z","success":1,"relation":"main_file","content_type":"application/pdf"}],"ddc":["000"],"keyword":["diarization","source separation","mixture model","meeting"],"language":[{"iso":"eng"}],"year":"2024","date_created":"2024-11-14T09:32:38Z","title":"Simultaneous Diarization and Separation of Meetings through the Integration of Statistical Mixture Models","type":"conference","status":"public","project":[{"name":"PC2: Computing Resources Provided by the Paderborn Center for Parallel Computing","_id":"52"},{"name":"Automatische Transkription von Gesprächssituationen","_id":"508"}],"_id":"57085","user_id":"44393","department":[{"_id":"54"}],"file_date_updated":"2025-08-14T08:11:57Z","has_accepted_license":"1","citation":{"ieee":"T. Cord-Landwehr, C. Boeddeker, and R. Haeb-Umbach, “Simultaneous Diarization and Separation of Meetings through the Integration of Statistical Mixture Models,” presented at the 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Hyderabad, India, 2024, doi: <a href=\"https://doi.org/10.1109/ICASSP49660.2025.10888445\">10.1109/ICASSP49660.2025.10888445</a>.","chicago":"Cord-Landwehr, Tobias, Christoph Boeddeker, and Reinhold Haeb-Umbach. “Simultaneous Diarization and Separation of Meetings through the Integration of Statistical Mixture Models.” In <i>ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 2024. <a href=\"https://doi.org/10.1109/ICASSP49660.2025.10888445\">https://doi.org/10.1109/ICASSP49660.2025.10888445</a>.","ama":"Cord-Landwehr T, Boeddeker C, Haeb-Umbach R. Simultaneous Diarization and Separation of Meetings through the Integration of Statistical Mixture Models. In: <i>ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>. ; 2024. doi:<a href=\"https://doi.org/10.1109/ICASSP49660.2025.10888445\">10.1109/ICASSP49660.2025.10888445</a>","apa":"Cord-Landwehr, T., Boeddeker, C., &#38; Haeb-Umbach, R. (2024). Simultaneous Diarization and Separation of Meetings through the Integration of Statistical Mixture Models. <i>ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>. 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Hyderabad, India. <a href=\"https://doi.org/10.1109/ICASSP49660.2025.10888445\">https://doi.org/10.1109/ICASSP49660.2025.10888445</a>","mla":"Cord-Landwehr, Tobias, et al. “Simultaneous Diarization and Separation of Meetings through the Integration of Statistical Mixture Models.” <i>ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</i>, 2024, doi:<a href=\"https://doi.org/10.1109/ICASSP49660.2025.10888445\">10.1109/ICASSP49660.2025.10888445</a>.","bibtex":"@inproceedings{Cord-Landwehr_Boeddeker_Haeb-Umbach_2024, title={Simultaneous Diarization and Separation of Meetings through the Integration of Statistical Mixture Models}, DOI={<a href=\"https://doi.org/10.1109/ICASSP49660.2025.10888445\">10.1109/ICASSP49660.2025.10888445</a>}, booktitle={ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, author={Cord-Landwehr, Tobias and Boeddeker, Christoph and Haeb-Umbach, Reinhold}, year={2024} }","short":"T. Cord-Landwehr, C. Boeddeker, R. Haeb-Umbach, in: ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2024."},"date_updated":"2025-08-14T08:12:22Z","oa":"1","author":[{"id":"44393","full_name":"Cord-Landwehr, Tobias","last_name":"Cord-Landwehr","first_name":"Tobias"},{"id":"40767","full_name":"Boeddeker, Christoph","last_name":"Boeddeker","first_name":"Christoph"},{"id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach","first_name":"Reinhold"}],"main_file_link":[{"open_access":"1","url":"https://arxiv.org/pdf/2410.21455"}],"doi":"10.1109/ICASSP49660.2025.10888445","conference":{"location":"Hyderabad, India","name":"2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"}},{"file_date_updated":"2023-11-22T07:58:49Z","user_id":"460","department":[{"_id":"54"}],"_id":"49109","status":"public","type":"conference","conference":{"start_date":"2023-10-31","name":"57th Asilomar Conference on Signals, Systems, and Computers","end_date":"2023-11-01"},"author":[{"full_name":"Gburrek, Tobias","id":"44006","last_name":"Gburrek","first_name":"Tobias"},{"last_name":"Schmalenstroeer","id":"460","full_name":"Schmalenstroeer, Joerg","first_name":"Joerg"},{"first_name":"Reinhold","id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach"}],"oa":"1","date_updated":"2023-11-22T07:58:49Z","citation":{"ama":"Gburrek T, Schmalenstroeer J, Haeb-Umbach R. Spatial Diarization for Meeting Transcription with Ad-Hoc Acoustic Sensor Networks. In: <i>Proc. Asilomar Conference on Signals, Systems, and Computers</i>. ; 2023.","chicago":"Gburrek, Tobias, Joerg Schmalenstroeer, and Reinhold Haeb-Umbach. “Spatial Diarization for Meeting Transcription with Ad-Hoc Acoustic Sensor Networks.” In <i>Proc. Asilomar Conference on Signals, Systems, and Computers</i>, 2023.","ieee":"T. Gburrek, J. Schmalenstroeer, and R. Haeb-Umbach, “Spatial Diarization for Meeting Transcription with Ad-Hoc Acoustic Sensor Networks,” presented at the 57th Asilomar Conference on Signals, Systems, and Computers, 2023.","bibtex":"@inproceedings{Gburrek_Schmalenstroeer_Haeb-Umbach_2023, title={Spatial Diarization for Meeting Transcription with Ad-Hoc Acoustic Sensor Networks}, booktitle={Proc. Asilomar Conference on Signals, Systems, and Computers}, author={Gburrek, Tobias and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold}, year={2023} }","mla":"Gburrek, Tobias, et al. “Spatial Diarization for Meeting Transcription with Ad-Hoc Acoustic Sensor Networks.” <i>Proc. Asilomar Conference on Signals, Systems, and Computers</i>, 2023.","short":"T. Gburrek, J. Schmalenstroeer, R. Haeb-Umbach, in: Proc. Asilomar Conference on Signals, Systems, and Computers, 2023.","apa":"Gburrek, T., Schmalenstroeer, J., &#38; Haeb-Umbach, R. (2023). Spatial Diarization for Meeting Transcription with Ad-Hoc Acoustic Sensor Networks. <i>Proc. Asilomar Conference on Signals, Systems, and Computers</i>. 57th Asilomar Conference on Signals, Systems, and Computers."},"has_accepted_license":"1","language":[{"iso":"eng"}],"ddc":["004"],"keyword":["Diarization","time difference of arrival","ad-hoc acoustic sensor network","meeting transcription"],"file":[{"creator":"schmalen","date_created":"2023-11-22T07:51:18Z","date_updated":"2023-11-22T07:58:49Z","file_name":"asilomar.pdf","access_level":"open_access","file_id":"49110","file_size":212317,"content_type":"application/pdf","relation":"main_file"}],"abstract":[{"text":"We propose a diarization system, that estimates “who spoke when” based on spatial information, to be used as a front-end of a meeting transcription system running on the signals gathered from an acoustic sensor network (ASN). Although the spatial distribution of the microphones is advantageous, exploiting the spatial diversity for diarization and signal enhancement is challenging, because the microphones’ positions are typically unknown, and the recorded signals are initially unsynchronized in general. Here, we approach these issues by first blindly synchronizing the signals and then estimating time differences of arrival (TDOAs). The TDOA information is exploited to estimate the speakers’ activity, even in the presence of multiple speakers being simultaneously active. This speaker activity information serves as a guide for a spatial mixture model, on which basis the individual speaker’s signals are extracted via beamforming. Finally, the extracted signals are forwarded to a speech recognizer. Additionally, a novel initialization scheme for spatial mixture models based on the TDOA estimates is proposed. Experiments conducted on real recordings from the LibriWASN data set have shown that our proposed system is advantageous compared to a system using a spatial mixture model, which does not make use of external diarization information.","lang":"eng"}],"publication":"Proc. Asilomar Conference on Signals, Systems, and Computers","title":"Spatial Diarization for Meeting Transcription with Ad-Hoc Acoustic Sensor Networks","date_created":"2023-11-22T07:52:29Z","year":"2023","quality_controlled":"1"},{"keyword":["Accuracy","Acoustics","Estimation","Mathematical model","Source separation","Speech","Vectors","Bayes methods","Blind source separation","Directional statistics","Number of speakers","Speaker diarization"],"language":[{"iso":"eng"}],"_id":"11753","user_id":"44006","department":[{"_id":"54"}],"abstract":[{"text":"This contribution describes a step-wise source counting algorithm to determine the number of speakers in an offline scenario. Each speaker is identified by a variational expectation maximization (VEM) algorithm for complex Watson mixture models and therefore directly yields beamforming vectors for a subsequent speech separation process. An observation selection criterion is proposed which improves the robustness of the source counting in noise. The algorithm is compared to an alternative VEM approach with Gaussian mixture models based on directions of arrival and shown to deliver improved source counting accuracy. The article concludes by extending the offline algorithm towards a low-latency online estimation of the number of active sources from the streaming input data.","lang":"eng"}],"status":"public","type":"conference","publication":"14th International Workshop on Acoustic Signal Enhancement (IWAENC 2014)","title":"Towards Online Source Counting in Speech Mixtures Applying a Variational EM for Complex Watson Mixture Models","main_file_link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2014/DrChTrHaeb14.pdf","open_access":"1"}],"date_updated":"2022-01-06T06:51:08Z","oa":"1","date_created":"2019-07-12T05:27:35Z","author":[{"first_name":"Lukas","last_name":"Drude","id":"11213","full_name":"Drude, Lukas"},{"first_name":"Aleksej","last_name":"Chinaev","full_name":"Chinaev, Aleksej"},{"full_name":"Tran Vu, Dang Hai","last_name":"Tran Vu","first_name":"Dang Hai"},{"last_name":"Haeb-Umbach","full_name":"Haeb-Umbach, Reinhold","id":"242","first_name":"Reinhold"}],"year":"2014","citation":{"bibtex":"@inproceedings{Drude_Chinaev_Tran Vu_Haeb-Umbach_2014, title={Towards Online Source Counting in Speech Mixtures Applying a Variational EM for Complex Watson Mixture Models}, booktitle={14th International Workshop on Acoustic Signal Enhancement (IWAENC 2014)}, author={Drude, Lukas and Chinaev, Aleksej and Tran Vu, Dang Hai and Haeb-Umbach, Reinhold}, year={2014}, pages={213–217} }","short":"L. Drude, A. Chinaev, D.H. Tran Vu, R. Haeb-Umbach, in: 14th International Workshop on Acoustic Signal Enhancement (IWAENC 2014), 2014, pp. 213–217.","mla":"Drude, Lukas, et al. “Towards Online Source Counting in Speech Mixtures Applying a Variational EM for Complex Watson Mixture Models.” <i>14th International Workshop on Acoustic Signal Enhancement (IWAENC 2014)</i>, 2014, pp. 213–17.","apa":"Drude, L., Chinaev, A., Tran Vu, D. H., &#38; Haeb-Umbach, R. (2014). Towards Online Source Counting in Speech Mixtures Applying a Variational EM for Complex Watson Mixture Models. In <i>14th International Workshop on Acoustic Signal Enhancement (IWAENC 2014)</i> (pp. 213–217).","ieee":"L. Drude, A. Chinaev, D. H. Tran Vu, and R. Haeb-Umbach, “Towards Online Source Counting in Speech Mixtures Applying a Variational EM for Complex Watson Mixture Models,” in <i>14th International Workshop on Acoustic Signal Enhancement (IWAENC 2014)</i>, 2014, pp. 213–217.","chicago":"Drude, Lukas, Aleksej Chinaev, Dang Hai Tran Vu, and Reinhold Haeb-Umbach. “Towards Online Source Counting in Speech Mixtures Applying a Variational EM for Complex Watson Mixture Models.” In <i>14th International Workshop on Acoustic Signal Enhancement (IWAENC 2014)</i>, 213–17, 2014.","ama":"Drude L, Chinaev A, Tran Vu DH, Haeb-Umbach R. Towards Online Source Counting in Speech Mixtures Applying a Variational EM for Complex Watson Mixture Models. In: <i>14th International Workshop on Acoustic Signal Enhancement (IWAENC 2014)</i>. ; 2014:213-217."},"page":"213-217","related_material":{"link":[{"url":"https://groups.uni-paderborn.de/nt/pubs/2014/DrChTrHaeb14_Poster.pdf","relation":"supplementary_material","description":"Poster"}]}},{"keyword":["audio streaming","audio visual data streaming","context information speech","face identification","face recognition","image segmentation","middleware","multimodal telecommunication","online diarization","service oriented middleware architecture","sessionless telecommunication","software architecture","speaker identification","speaker localization","speaker recognition","steerable camera","telecommunication computing","temporal segmentation","terminal-less telecommunication","video streaming"],"language":[{"iso":"eng"}],"_id":"11892","department":[{"_id":"54"}],"user_id":"460","abstract":[{"lang":"eng","text":"For an environment to be perceived as being smart, contextual information has to be gathered to adapt the system's behavior and its interface towards the user. Being a rich source of context information speech can be acquired unobtrusively by microphone arrays and then processed to extract information about the user and his environment. In this paper, a system for joint temporal segmentation, speaker localization, and identification is presented, which is supported by face identification from video data obtained from a steerable camera. Special attention is paid to latency aspects and online processing capabilities, as they are important for the application under investigation, namely ambient communication. It describes the vision of terminal-less, session-less and multi-modal telecommunication with remote partners, where the user can move freely within his home while the communication follows him. The speaker diarization serves as a context source, which has been integrated in a service-oriented middleware architecture and provided to the application to select the most appropriate I/O device and to steer the camera towards the speaker during ambient communication."}],"status":"public","publication":"IEEE Journal of Selected Topics in Signal Processing","type":"journal_article","title":"Online Diarization of Streaming Audio-Visual Data for Smart Environments","doi":"10.1109/JSTSP.2010.2050519","main_file_link":[{"open_access":"1","url":"https://groups.uni-paderborn.de/nt/pubs/2010/ScHa10.pdf"}],"date_updated":"2023-10-26T08:10:18Z","oa":"1","volume":4,"author":[{"last_name":"Schmalenstroeer","full_name":"Schmalenstroeer, Joerg","id":"460","first_name":"Joerg"},{"first_name":"Reinhold","id":"242","full_name":"Haeb-Umbach, Reinhold","last_name":"Haeb-Umbach"}],"date_created":"2019-07-12T05:30:16Z","year":"2010","page":"845-856","intvolume":"         4","citation":{"ama":"Schmalenstroeer J, Haeb-Umbach R. Online Diarization of Streaming Audio-Visual Data for Smart Environments. <i>IEEE Journal of Selected Topics in Signal Processing</i>. 2010;4(5):845-856. doi:<a href=\"https://doi.org/10.1109/JSTSP.2010.2050519\">10.1109/JSTSP.2010.2050519</a>","ieee":"J. Schmalenstroeer and R. Haeb-Umbach, “Online Diarization of Streaming Audio-Visual Data for Smart Environments,” <i>IEEE Journal of Selected Topics in Signal Processing</i>, vol. 4, no. 5, pp. 845–856, 2010, doi: <a href=\"https://doi.org/10.1109/JSTSP.2010.2050519\">10.1109/JSTSP.2010.2050519</a>.","chicago":"Schmalenstroeer, Joerg, and Reinhold Haeb-Umbach. “Online Diarization of Streaming Audio-Visual Data for Smart Environments.” <i>IEEE Journal of Selected Topics in Signal Processing</i> 4, no. 5 (2010): 845–56. <a href=\"https://doi.org/10.1109/JSTSP.2010.2050519\">https://doi.org/10.1109/JSTSP.2010.2050519</a>.","apa":"Schmalenstroeer, J., &#38; Haeb-Umbach, R. (2010). Online Diarization of Streaming Audio-Visual Data for Smart Environments. <i>IEEE Journal of Selected Topics in Signal Processing</i>, <i>4</i>(5), 845–856. <a href=\"https://doi.org/10.1109/JSTSP.2010.2050519\">https://doi.org/10.1109/JSTSP.2010.2050519</a>","short":"J. Schmalenstroeer, R. Haeb-Umbach, IEEE Journal of Selected Topics in Signal Processing 4 (2010) 845–856.","bibtex":"@article{Schmalenstroeer_Haeb-Umbach_2010, title={Online Diarization of Streaming Audio-Visual Data for Smart Environments}, volume={4}, DOI={<a href=\"https://doi.org/10.1109/JSTSP.2010.2050519\">10.1109/JSTSP.2010.2050519</a>}, number={5}, journal={IEEE Journal of Selected Topics in Signal Processing}, author={Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold}, year={2010}, pages={845–856} }","mla":"Schmalenstroeer, Joerg, and Reinhold Haeb-Umbach. “Online Diarization of Streaming Audio-Visual Data for Smart Environments.” <i>IEEE Journal of Selected Topics in Signal Processing</i>, vol. 4, no. 5, 2010, pp. 845–56, doi:<a href=\"https://doi.org/10.1109/JSTSP.2010.2050519\">10.1109/JSTSP.2010.2050519</a>."},"quality_controlled":"1","issue":"5"}]
