@inproceedings{33471,
  abstract     = {The intelligibility of demodulated audio signals from analog high frequency transmissions, e.g., using single-sideband (SSB) modulation, can be severely degraded by channel distortions and/or a mismatch between modulation and demodulation carrier frequency. In this work a neural network (NN)-based approach for carrier frequency offset (CFO) estimation from demodulated SSB signals is proposed, whereby a task specific architecture is presented. Additionally, a simulation framework for SSB signals is introduced and utilized for training the NNs. The CFO estimator is combined with a speech enhancement network to investigate its influence on the enhancement performance. The NN-based system is compared to a recently proposed pitch tracking based approach on publicly available data from real high frequency transmissions. Experiments show that the NN exhibits good CFO estimation properties and results in significant improvements in speech intelligibility, especially when combined with a noise reduction network.},
  author       = {Heitkaemper, Jens and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold},
  booktitle    = {Proceedings of the 30th European Signal Processing Conference (EUSIPCO)},
  location     = {Belgrade, Serbia},
  title        = {Neural Network Based Carrier Frequency Offset Estimation From Speech Transmitted Over High Frequency Channels},
  year         = {2022},
}

@inproceedings{33808,
  author       = {Gburrek, Tobias and Schmalenstroeer, Joerg and Heitkaemper, Jens and Haeb-Umbach, Reinhold},
  booktitle    = {2022 International Workshop on Acoustic Signal Enhancement (IWAENC)},
  location     = {Bamberg, Germany},
  publisher    = {IEEE},
  title        = {Informed vs. Blind Beamforming in Ad-Hoc Acoustic Sensor Networks for Meeting Transcription},
  doi          = {10.1109/IWAENC53105.2022.9914772},
  year         = {2022},
}

@inproceedings{24000,
  author       = {Heitkaemper, Jens and Schmalenstroeer, Joerg and Ion, Valentin and Haeb-Umbach, Reinhold},
  booktitle    = {Speech Communication; 14th {ITG}-Symposium},
  pages        = {1--5},
  title        = {A Database for Research on Detection and Enhancement of Speech Transmitted over {HF} links},
  year         = {2021},
}

@inproceedings{23998,
  author       = {Schmalenstroeer, Joerg and Heitkaemper, Jens and Ullmann, Joerg and Haeb-Umbach, Reinhold},
  booktitle    = {29th European Signal Processing Conference (EUSIPCO)},
  pages        = {1--5},
  title        = {Open Range Pitch Tracking for Carrier Frequency Difference Estimation from {HF} Transmitted Speech},
  year         = {2021},
}

@inproceedings{20700,
  author       = {Boeddeker, Christoph and Cord-Landwehr, Tobias and Heitkaemper, Jens and Zorila, Catalin and Hayakawa, Daichi and Li, Mohan and Liu, Min and Doddipatla, Rama and Haeb-Umbach, Reinhold},
  booktitle    = {Proc. {CHiME} 2020 Workshop on Speech Processing in Everyday Environments},
  title        = {Towards a speaker diarization system for the {CHiME} 2020 dinner party transcription},
  year         = {2020},
}

@inproceedings{20701,
  abstract     = {This paper describes Asteroid, the PyTorch-based audio source separation toolkit for researchers. Inspired by the most successful neural source separation systems, it provides all neural building blocks required to build such a system. To improve reproducibility, Kaldi-style recipes on common audio source separation datasets are also provided. This paper describes the software architecture of Asteroid and its most important features. By showing experimental results obtained with Asteroid's recipes, we show that our implementations are at least on par with most results reported in reference papers. The toolkit is publicly available at github.com/mpariente/asteroid.},
  author       = {Pariente, Manuel and Cornell, Samuele and Cosentino, Joris and Sivasankaran, Sunit and Tzinis, Efthymios and Heitkaemper, Jens and Olvera, Michel and Stöter, Fabian-Robert and Hu, Mathieu and Martín-Doñas, Juan M. and Ditter, David and Frank, Ariel and Deleforge, Antoine and Vincent, Emmanuel},
  booktitle    = {Interspeech 2020},
  title        = {{Asteroid}: The {PyTorch}-Based Audio Source Separation Toolkit for Researchers},
  doi          = {10.21437/interspeech.2020-1673},
  year         = {2020},
}

@inproceedings{20504,
  abstract     = {In recent years time domain speech separation has excelled over frequency domain separation in single channel scenarios and noise-free environments. In this paper we dissect the gains of the time-domain audio separation network (TasNet) approach by gradually replacing components of an utterance-level permutation invariant training (u-PIT) based separation system in the frequency domain until the TasNet system is reached, thus blending components of frequency domain approaches with those of time domain approaches. Some of the intermediate variants achieve comparable signal-to-distortion ratio (SDR) gains to TasNet, but retain the advantage of frequency domain processing: compatibility with classic signal processing tools such as frequency-domain beamforming and the human interpretability of the masks. Furthermore, we show that the scale invariant signal-to-distortion ratio (si-SDR) criterion used as loss function in TasNet is related to a logarithmic mean square error criterion and that it is this criterion which contributes most reliable to the performance advantage of TasNet. Finally, we critically assess which gains in a noise-free single channel environment generalize to more realistic reverberant conditions.},
  author       = {Heitkaemper, Jens and Jakobeit, Darius and Boeddeker, Christoph and Drude, Lukas and Haeb-Umbach, Reinhold},
  booktitle    = {ICASSP 2020 Virtual Barcelona Spain},
  keywords     = {voice activity detection, speech activity detection, neural network, statistical speech processing},
  internal-note = {keywords are identical to entry 20505 (a speech activity detection paper) and do not match this source separation paper -- likely copy-paste; verify},
  title        = {Demystifying {TasNet}: A Dissecting Approach},
  year         = {2020},
}

@inproceedings{20505,
  abstract     = {Speech activity detection (SAD), which often rests on the fact that the noise is ``more'' stationary than speech, is particularly challenging in non-stationary environments, because the time variance of the acoustic scene makes it difficult to discriminate speech from noise. We propose two approaches to SAD, where one is based on statistical signal processing, while the other utilizes neural networks. The former employs sophisticated signal processing to track the noise and speech energies and is meant to support the case for a resource efficient, unsupervised signal processing approach. The latter introduces a recurrent network layer that operates on short segments of the input speech to do temporal smoothing in the presence of non-stationary noise. The systems are tested on the Fearless Steps challenge database, which consists of the transmission data from the Apollo-11 space mission. The statistical SAD achieves comparable detection performance to earlier proposed neural network based SADs, while the neural network based approach leads to a decision cost function of 1.07% on the evaluation set of the 2020 Fearless Steps Challenge, which sets a new state of the art.},
  author       = {Heitkaemper, Jens and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold},
  booktitle    = {INTERSPEECH 2020 Virtual Shanghai China},
  keywords     = {voice activity detection, speech activity detection, neural network, statistical speech processing},
  title        = {Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments},
  year         = {2020},
}

@article{19446,
  abstract     = {We present a multi-channel database of overlapping speech for training, evaluation, and detailed analysis of source separation and extraction algorithms: SMS-WSJ -- Spatialized Multi-Speaker Wall Street Journal. It consists of artificially mixed speech taken from the WSJ database, but unlike earlier databases we consider all WSJ0+1 utterances and take care of strictly separating the speaker sets present in the training, validation and test sets. When spatializing the data we ensure a high degree of randomness w.r.t. room size, array center and rotation, as well as speaker position. Furthermore, this paper offers a critical assessment of recently proposed measures of source separation performance. Alongside the code to generate the database we provide a source separation baseline and a Kaldi recipe with competitive word error rates to provide common ground for evaluation.},
  author       = {Drude, Lukas and Heitkaemper, Jens and Boeddeker, Christoph and Haeb-Umbach, Reinhold},
  journal      = {ArXiv e-prints},
  eprint       = {1910.13934},
  eprinttype   = {arXiv},
  title        = {{SMS-WSJ}: Database, performance measures, and baseline recipe for multi-channel source separation and recognition},
  year         = {2019},
}

@inproceedings{14822,
  abstract     = {Multi-talker speech and moving speakers still pose a significant challenge to automatic speech recognition systems. Assuming an enrollment utterance of the target speaker is available, the so-called SpeakerBeam concept has been recently proposed to extract the target speaker from a speech mixture. If multi-channel input is available, spatial properties of the speaker can be exploited to support the source extraction. In this contribution we investigate different approaches to exploit such spatial information. In particular, we are interested in the question, how useful this information is if the target speaker changes his/her position. To this end, we present a SpeakerBeam-based source extraction network that is adapted to work on moving speakers by recursively updating the beamformer coefficients. Experimental results are presented on two data sets, one with artificially created room impulse responses, and one with real room impulse responses and noise recorded in a conference room. Interestingly, spatial features turn out to be advantageous even if the speaker position changes.},
  author       = {Heitkaemper, Jens and Feher, Thomas and Freitag, Michael and Haeb-Umbach, Reinhold},
  booktitle    = {International Conference on Statistical Language and Speech Processing 2019, Ljubljana, Slovenia},
  title        = {A Study on Online Source Extraction in the Presence of Changing Speaker Positions},
  year         = {2019},
}

@inproceedings{14824,
  abstract     = {This paper deals with multi-channel speech recognition in scenarios with multiple speakers. Recently, the spectral characteristics of a target speaker, extracted from an adaptation utterance, have been used to guide a neural network mask estimator to focus on that speaker. In this work we present two variants of speaker-aware neural networks, which exploit both spectral and spatial information to allow better discrimination between target and interfering speakers. Thus, we introduce either a spatial preprocessing prior to the mask estimation or a spatial plus spectral speaker characterization block whose output is directly fed into the neural mask estimator. The target speaker's spectral and spatial signature is extracted from an adaptation utterance recorded at the beginning of a session. We further adapt the architecture for low-latency processing by means of block-online beamforming that recursively updates the signal statistics. Experimental results show that the additional spatial information clearly improves source extraction, in particular in the same-gender case, and that our proposal achieves state-of-the-art performance in terms of distortion reduction and recognition accuracy.},
  author       = {Martín-Doñas, Juan M. and Heitkaemper, Jens and Haeb-Umbach, Reinhold and Gomez, Angel M. and Peinado, Antonio M.},
  booktitle    = {INTERSPEECH 2019, Graz, Austria},
  title        = {Multi-Channel Block-Online Source Extraction based on Utterance Adaptation},
  year         = {2019},
}

@inproceedings{14826,
  abstract     = {In this paper, we present Hitachi and Paderborn University's joint effort for automatic speech recognition (ASR) in a dinner party scenario. The main challenges of ASR systems for dinner party recordings obtained by multiple microphone arrays are (1) heavy speech overlaps, (2) severe noise and reverberation, (3) very natural conversational content, and possibly (4) insufficient training data. As an example of a dinner party scenario, we have chosen the data presented during the CHiME-5 speech recognition challenge, where the baseline ASR had a 73.3% word error rate (WER), and even the best performing system at the CHiME-5 challenge had a 46.1% WER. We extensively investigated a combination of the guided source separation-based speech enhancement technique and an already proposed strong ASR backend and found that a tight combination of these techniques provided substantial accuracy improvements. Our final system achieved WERs of 39.94% and 41.64% for the development and evaluation data, respectively, both of which are the best published results for the dataset. We also investigated with additional training data on the official small data in the CHiME-5 corpus to assess the intrinsic difficulty of this ASR task.},
  author       = {Kanda, Naoyuki and Boeddeker, Christoph and Heitkaemper, Jens and Fujita, Yusuke and Horiguchi, Shota and Haeb-Umbach, Reinhold},
  booktitle    = {INTERSPEECH 2019, Graz, Austria},
  title        = {Guided Source Separation Meets a Strong {ASR} Backend: {Hitachi}/{Paderborn University} Joint Investigation for Dinner Party {ASR}},
  year         = {2019},
}

@inproceedings{11837,
  abstract     = {We present a block-online multi-channel front end for automatic speech recognition in noisy and reverberated environments. It is an online version of our earlier proposed neural network supported acoustic beamformer, whose coefficients are calculated from noise and speech spatial covariance matrices which are estimated utilizing a neural mask estimator. However, the sparsity of speech in the STFT domain causes problems for the initial beamformer coefficients estimation in some frequency bins due to lack of speech observations. We propose two methods to mitigate this issue. The first is to lower the frequency resolution of the STFT, which comes with the additional advantage of a reduced time window, thus lowering the latency introduced by block processing. The second approach is to smooth beamforming coefficients along the frequency axis, thus exploiting their high inter-frequency correlation. With both approaches the gap between offline and block-online beamformer performance, as measured by the word error rate achieved by a downstream speech recognizer, is significantly reduced. Experiments are carried out on two corpora, representing noisy (CHiME-4) and noisy reverberant (voiceHome) environments.},
  author       = {Heitkaemper, Jens and Heymann, Jahn and Haeb-Umbach, Reinhold},
  booktitle    = {ITG 2018, Oldenburg, Germany},
  title        = {Smoothing along Frequency in Online Neural Network Supported Acoustic Beamforming},
  year         = {2018},
}

@inproceedings{12899,
  abstract     = {This contribution presents a speech enhancement system for the CHiME-5 Dinner Party Scenario. The front-end employs multi-channel linear time-variant filtering and achieves its gains without the use of a neural network. We present an adaptation of blind source separation techniques to the CHiME-5 database which we call Guided Source Separation (GSS). Using the baseline acoustic and language model, the combination of Weighted Prediction Error based dereverberation, guided source separation, and beamforming reduces the WER by 10.54% (relative) for the single array track and by 21.12% (relative) on the multiple array track.},
  author       = {Boeddeker, Christoph and Heitkaemper, Jens and Schmalenstroeer, Joerg and Drude, Lukas and Heymann, Jahn and Haeb-Umbach, Reinhold},
  booktitle    = {Proc. {CHiME} 2018 Workshop on Speech Processing in Everyday Environments, Hyderabad, India},
  title        = {Front-End Processing for the {CHiME-5} Dinner Party Scenario},
  year         = {2018},
}

@inproceedings{11876,
  abstract     = {This paper describes the systems for the single-array track and the multiple-array track of the 5th CHiME Challenge. The final system is a combination of multiple systems, using Confusion Network Combination (CNC). The different systems presented here are utilizing different front-ends and training sets for a Bidirectional Long Short-Term Memory (BLSTM) Acoustic Model (AM). The front-end was replaced by enhancements provided by Paderborn University [1]. The back-end has been implemented using RASR [2] and RETURNN [3]. Additionally, a system combination including the hypothesis word graphs from the system of the submission [1] has been performed, which results in the final best system.},
  author       = {Kitza, Markus and Michel, Wilfried and Boeddeker, Christoph and Heitkaemper, Jens and Menne, Tobias and Schlüter, Ralf and Ney, Hermann and Schmalenstroeer, Joerg and Drude, Lukas and Heymann, Jahn and Haeb-Umbach, Reinhold},
  booktitle    = {Proc. {CHiME} 2018 Workshop on Speech Processing in Everyday Environments, Hyderabad, India},
  title        = {The {RWTH/UPB} System Combination for the {CHiME} 2018 Workshop},
  year         = {2018},
}

@inproceedings{11836,
  abstract     = {Due to their distributed nature wireless acoustic sensor networks offer great potential for improved signal acquisition, processing and classification for applications such as monitoring and surveillance, home automation, or hands-free telecommunication. To reduce the communication demand with a central server and to raise the privacy level it is desirable to perform processing at node level. The limited processing and memory capabilities on a sensor node, however, stand in contrast to the compute and memory intensive deep learning algorithms used in modern speech and audio processing. In this work, we perform benchmarking of commonly used convolutional and recurrent neural network architectures on a Raspberry Pi based acoustic sensor node. We show that it is possible to run medium-sized neural network topologies used for speech enhancement and speech recognition in real time. For acoustic event recognition, where predictions in a lower temporal resolution are sufficient, it is even possible to run current state-of-the-art deep convolutional models with a real-time-factor of 0.11.},
  author       = {Ebbers, Janek and Heitkaemper, Jens and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold},
  booktitle    = {ITG 2018, Oldenburg, Germany},
  title        = {Benchmarking Neural Network Architectures for Acoustic Sensor Networks},
  year         = {2018},
}

@inproceedings{11743,
  abstract     = {This contribution introduces a novel causal a priori signal-to-noise ratio (SNR) estimator for single-channel speech enhancement. To exploit the advantages of the generalized spectral subtraction, a normalized $\alpha$-order magnitude (NAOM) domain is introduced where an a priori SNR estimation is carried out. In this domain, the NAOM coefficients of noise and clean speech signals are modeled by a Weibull distribution and a Weibull mixture model (WMM), respectively. While the parameters of the noise model are calculated from the noise power spectral density estimates, the speech WMM parameters are estimated from the noisy signal by applying a causal Expectation-Maximization algorithm. Further a maximum a posteriori estimate of the a priori SNR is developed. The experiments in different noisy environments show the superiority of the proposed estimator compared to the well-known decision-directed approach in terms of estimation error, estimator variance and speech quality of the enhanced signals when used for speech enhancement.},
  author       = {Chinaev, Aleksej and Heitkaemper, Jens and Haeb-Umbach, Reinhold},
  booktitle    = {12. ITG Fachtagung Sprachkommunikation (ITG 2016)},
  title        = {A Priori {SNR} Estimation Using {Weibull} Mixture Model},
  year         = {2016},
}

