@inproceedings{59999,
  author       = {Rautenberg, Frederik and Kuhlmann, Michael and Seebauer, Fritz and Wiechmann, Jana and Wagner, Petra and Haeb-Umbach, Reinhold},
  booktitle    = {{ICASSP} 2025 - 2025 {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  location     = {Hyderabad, India},
  publisher    = {{IEEE}},
  title        = {Speech Synthesis along Perceptual Voice Quality Dimensions},
  doi          = {10.1109/icassp49660.2025.10888012},
  year         = {2025},
}

@inproceedings{61047,
  author       = {Rautenberg, Frederik and Seebauer, Fritz and Wiechmann, Jana and Kuhlmann, Michael and Wagner, Petra and Haeb-Umbach, Reinhold},
  booktitle    = {Interspeech 2025},
  location     = {Rotterdam},
  publisher    = {{ISCA}},
  title        = {Synthesizing Speech with Selected Perceptual Voice Qualities -- A Case Study with Creaky Voice},
  doi          = {10.21437/Interspeech.2025-1443},
  year         = {2025},
}

@inproceedings{57099,
  author       = {Xie, Yuying and Kuhlmann, Michael and Rautenberg, Frederik and Tan, Zheng-Hua and Haeb-Umbach, Reinhold},
  booktitle    = {2024 32nd European Signal Processing Conference ({EUSIPCO})},
  pages        = {436--440},
  title        = {Speaker and Style Disentanglement of Speech Based on Contrastive Predictive Coding Supported Factorized Variational Autoencoder},
  year         = {2024},
}

@inproceedings{48355,
  abstract     = {Unsupervised speech disentanglement aims at separating fast varying from
slowly varying components of a speech signal. In this contribution, we take a
closer look at the embedding vector representing the slowly varying signal
components, commonly named the speaker embedding vector. We ask, which
properties of a speaker's voice are captured and investigate to which extent do
individual embedding vector components sign responsible for them, using the
concept of Shapley values. Our findings show that certain speaker-specific
acoustic-phonetic properties can be fairly well predicted from the speaker
embedding, while the investigated more abstract voice quality features cannot.},
  author       = {Rautenberg, Frederik and Kuhlmann, Michael and Wiechmann, Jana and Seebauer, Fritz and Wagner, Petra and Haeb-Umbach, Reinhold},
  booktitle    = {{ITG} Conference on Speech Communication},
  location     = {Aachen},
  title        = {On Feature Importance and Interpretability of Speaker Representations},
  year         = {2023},
}

@inproceedings{48410,
  author       = {Wiechmann, Jana and Rautenberg, Frederik and Wagner, Petra and Haeb-Umbach, Reinhold},
  booktitle    = {20th International Congress of the Phonetic Sciences ({ICPhS})},
  title        = {Explaining voice characteristics to novice voice practitioners -- How successful is it?},
  year         = {2023},
}

@inproceedings{44849,
  author       = {Rautenberg, Frederik and Kuhlmann, Michael and Ebbers, Janek and Wiechmann, Jana and Seebauer, Fritz and Wagner, Petra and Haeb-Umbach, Reinhold},
  booktitle    = {Fortschritte der Akustik - {DAGA} 2023},
  location     = {Hamburg},
  pages        = {1409--1412},
  title        = {Speech Disentanglement for Analysis and Modification of Acoustic and Perceptual Speaker Characteristics},
  year         = {2023},
}

@inproceedings{33696,
  author       = {Wiechmann, Jana and Glarner, Thomas and Rautenberg, Frederik and Wagner, Petra and Haeb-Umbach, Reinhold},
  booktitle    = {18. Phonetik und Phonologie im deutschsprachigen Raum ({P\&P})},
  location     = {Bielefeld},
  title        = {Technically enabled explaining of voice characteristics},
  year         = {2022},
}

@inproceedings{44843,
  abstract     = {Unsupervised blind source separation methods do not require a training phase
and thus cannot suffer from a train-test mismatch, which is a common concern in
neural network based source separation. The unsupervised techniques can be
categorized in two classes, those building upon the sparsity of speech in the
Short-Time Fourier transform domain and those exploiting non-Gaussianity or
non-stationarity of the source signals. In this contribution, spatial mixture
models which fall in the first category and independent vector analysis (IVA)
as a representative of the second category are compared w.r.t. their separation
performance and the performance of a downstream speech recognizer on a
reverberant dataset of reasonable size. Furthermore, we introduce a serial
concatenation of the two, where the result of the mixture model serves as
initialization of IVA, which achieves significantly better WER performance than
each algorithm individually and even approaches the performance of a much more
complex neural network based technique.},
  author       = {Boeddeker, Christoph and Rautenberg, Frederik and Haeb-Umbach, Reinhold},
  booktitle    = {{ITG} Conference on Speech Communication},
  location     = {Kiel},
  title        = {A Comparison and Combination of Unsupervised Blind Source Separation Techniques},
  year         = {2021},
}

