% Bibliography, cleaned from an auto-export:
% - removed the extra brace layer around every field value ({{...}} around an
%   author list makes BibTeX parse it as a single corporate surname; {{{...}}}
%   around a title disables style-controlled casing entirely),
% - brace-protected only the tokens that must keep their capitalisation,
% - joined fields that were split across physical lines, trimmed stray padding,
% - repaired PDF-extraction garbling in the abstracts (glued words, leftover
%   hyphenation) and replaced curly Unicode quotes with BibTeX-safe ASCII,
% - added eprint/eprinttype to the arXiv entry (number taken from its DOI).
% Citation keys and entry types are unchanged so existing \cite commands work.

@inproceedings{48269,
  author    = {Gburrek, Tobias and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold},
  booktitle = {European Signal Processing Conference (EUSIPCO)},
  location  = {Helsinki},
  title     = {On the Integration of Sampling Rate Synchronization and Acoustic Beamforming},
  year      = {2023},
}

@inproceedings{48270,
  author    = {Schmalenstroeer, Joerg and Gburrek, Tobias and Haeb-Umbach, Reinhold},
  booktitle = {ITG Conference on Speech Communication},
  location  = {Aachen},
  title     = {{LibriWASN}: A Data Set for Meeting Separation, Diarization, and Recognition with Asynchronous Recording Devices},
  year      = {2023},
}

@inproceedings{49109,
  abstract  = {We propose a diarization system, that estimates ``who spoke when'' based on spatial information, to be used as a front-end of a meeting transcription system running on the signals gathered from an acoustic sensor network (ASN). Although the spatial distribution of the microphones is advantageous, exploiting the spatial diversity for diarization and signal enhancement is challenging, because the microphones' positions are typically unknown, and the recorded signals are initially unsynchronized in general. Here, we approach these issues by first blindly synchronizing the signals and then estimating time differences of arrival (TDOAs). The TDOA information is exploited to estimate the speakers' activity, even in the presence of multiple speakers being simultaneously active. This speaker activity information serves as a guide for a spatial mixture model, on which basis the individual speaker's signals are extracted via beamforming. Finally, the extracted signals are forwarded to a speech recognizer. Additionally, a novel initialization scheme for spatial mixture models based on the TDOA estimates is proposed. Experiments conducted on real recordings from the LibriWASN data set have shown that our proposed system is advantageous compared to a system using a spatial mixture model, which does not make use of external diarization information.},
  author    = {Gburrek, Tobias and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold},
  booktitle = {Proc. Asilomar Conference on Signals, Systems, and Computers},
  keywords  = {Diarization, time difference of arrival, ad-hoc acoustic sensor network, meeting transcription},
  title     = {Spatial Diarization for Meeting Transcription with Ad-Hoc Acoustic Sensor Networks},
  year      = {2023},
}

@inproceedings{33806,
  author    = {Afifi, Haitham and Karl, Holger and Gburrek, Tobias and Schmalenstroeer, Joerg},
  booktitle = {2022 International Wireless Communications and Mobile Computing (IWCMC)},
  publisher = {IEEE},
  title     = {Data-Driven Time Synchronization in Wireless Multimedia Networks},
  doi       = {10.1109/iwcmc55113.2022.9824980},
  year      = {2022},
}

@inproceedings{33807,
  author    = {Gburrek, Tobias and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold},
  booktitle = {ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  publisher = {IEEE},
  title     = {On Synchronization of Wireless Acoustic Sensor Networks in the Presence of Time-Varying Sampling Rate Offsets and Speaker Changes},
  doi       = {10.1109/icassp43922.2022.9746284},
  year      = {2022},
}

@inproceedings{33808,
  author    = {Gburrek, Tobias and Schmalenstroeer, Joerg and Heitkaemper, Jens and Haeb-Umbach, Reinhold},
  booktitle = {2022 International Workshop on Acoustic Signal Enhancement (IWAENC)},
  location  = {Bamberg, Germany},
  publisher = {IEEE},
  title     = {Informed vs. Blind Beamforming in Ad-Hoc Acoustic Sensor Networks for Meeting Transcription},
  doi       = {10.1109/IWAENC53105.2022.9914772},
  year      = {2022},
}

@misc{33816,
  author     = {Gburrek, Tobias and Boeddeker, Christoph and von Neumann, Thilo and Cord-Landwehr, Tobias and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold},
  publisher  = {arXiv},
  title      = {A Meeting Transcription System for an Ad-Hoc Acoustic Sensor Network},
  doi        = {10.48550/ARXIV.2205.00944},
  eprint     = {2205.00944},
  eprinttype = {arXiv},
  year       = {2022},
}

@article{22528,
  abstract = {Due to the ad hoc nature of wireless acoustic sensor networks, the position of the sensor nodes is typically unknown. This contribution proposes a technique to estimate the position and orientation of the sensor nodes from the recorded speech signals. The method assumes that a node comprises a microphone array with synchronously sampled microphones rather than a single microphone, but does not require the sampling clocks of the nodes to be synchronized. From the observed audio signals, the distances between the acoustic sources and arrays, as well as the directions of arrival, are estimated. They serve as input to a non-linear least squares problem, from which both the sensor nodes' positions and orientations, as well as the source positions, are alternatingly estimated in an iterative process. Given one set of unknowns, i.e., either the source positions or the sensor nodes' geometry, the other set of unknowns can be computed in closed-form. The proposed approach is computationally efficient and the first one, which employs both distance and directional information for geometry calibration in a common cost function. Since both distance and direction of arrival measurements suffer from outliers, e.g., caused by strong reflections of the sound waves on the surfaces of the room, we introduce measures to deemphasize or remove unreliable measurements. Additionally, we discuss modifications of our previously proposed deep neural network-based acoustic distance estimator, to account not only for omnidirectional sources but also for directional sources. Simulation results show good positioning accuracy and compare very favorably with alternative approaches from the literature.},
  author   = {Gburrek, Tobias and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold},
  issn     = {1687-4722},
  journal  = {EURASIP Journal on Audio, Speech, and Music Processing},
  title    = {Geometry Calibration in Wireless Acoustic Sensor Networks Utilizing {DoA} and Distance Information},
  doi      = {10.1186/s13636-021-00210-x},
  year     = {2021},
}

@inproceedings{23994,
  author    = {Gburrek, Tobias and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold},
  booktitle = {ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  title     = {Iterative Geometry Calibration from Distance Estimates for Wireless Acoustic Sensor Networks},
  doi       = {10.1109/icassp39728.2021.9413831},
  year      = {2021},
}

@inproceedings{23999,
  author    = {Gburrek, Tobias and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold},
  booktitle = {Speech Communication; 14th ITG-Symposium},
  pages     = {1--5},
  title     = {On Source-Microphone Distance Estimation Using Convolutional Recurrent Neural Networks},
  year      = {2021},
}

@inproceedings{23997,
  author    = {Chinaev, Aleksej and Enzner, Gerald and Gburrek, Tobias and Schmalenstroeer, Joerg},
  booktitle = {29th European Signal Processing Conference (EUSIPCO)},
  pages     = {1--5},
  title     = {Online Estimation of Sampling Rate Offsets in Wireless Acoustic Sensor Networks with Packet Loss},
  year      = {2021},
}

@inproceedings{18651,
  abstract  = {We present an approach to deep neural network based (DNN-based) distance estimation in reverberant rooms for supporting geometry calibration tasks in wireless acoustic sensor networks. Signal diffuseness information from acoustic signals is aggregated via the coherent-to-diffuse power ratio to obtain a distance-related feature, which is mapped to a source-to-microphone distance estimate by means of a DNN. This information is then combined with direction-of-arrival estimates from compact microphone arrays to infer the geometry of the sensor network. Unlike many other approaches to geometry calibration, the proposed scheme does only require that the sampling clocks of the sensor nodes are roughly synchronized. In simulations we show that the proposed DNN-based distance estimator generalizes to unseen acoustic environments and that precise estimates of the sensor node positions are obtained.},
  author    = {Gburrek, Tobias and Schmalenstroeer, Joerg and Brendel, Andreas and Kellermann, Walter and Haeb-Umbach, Reinhold},
  booktitle = {European Signal Processing Conference (EUSIPCO)},
  title     = {Deep Neural Network Based Distance Estimation for Geometry Calibration in Acoustic Sensor Network},
  year      = {2020},
}

@inproceedings{15237,
  abstract  = {This paper presents an approach to voice conversion, which does neither require parallel data nor speaker or phone labels for training. It can convert between speakers which are not in the training set by employing the previously proposed concept of a factorized hierarchical variational autoencoder. Here, linguistic and speaker induced variations are separated upon the notion that content induced variations change at a much shorter time scale, i.e., at the segment level, than speaker induced variations, which vary at the longer utterance level. In this contribution we propose to employ convolutional instead of recurrent network layers in the encoder and decoder blocks, which is shown to achieve better phone recognition accuracy on the latent segment variables at frame-level due to their better temporal resolution. For voice conversion the mean of the utterance variables is replaced with the respective estimated mean of the target speaker. The resulting log-mel spectra of the decoder output are used as local conditions of a WaveNet which is utilized for synthesis of the speech waveforms. Experiments show both good disentanglement properties of the latent space variables, and good voice conversion performance.},
  author    = {Gburrek, Tobias and Glarner, Thomas and Ebbers, Janek and Haeb-Umbach, Reinhold and Wagner, Petra},
  booktitle = {Proc. 10th ISCA Speech Synthesis Workshop},
  location  = {Vienna},
  pages     = {81--86},
  title     = {Unsupervised Learning of a Disentangled Speech Representation for Voice Conversion},
  doi       = {10.21437/SSW.2019-15},
  year      = {2019},
}