@inproceedings{11838,
  abstract     = {Distributed sensor data acquisition usually encompasses data sampling by the individual devices, where each of them has its own oscillator driving the local sampling process, resulting in slightly different sampling rates at the individual sensor nodes. Nevertheless, for certain downstream signal processing tasks it is important to compensate even for small sampling rate offsets. Aligning the sampling rates of oscillators which differ only by a few parts-per-million, is, however, challenging and quite different from traditional multirate signal processing tasks. In this paper we propose to transfer a precise but computationally demanding time domain approach, inspired by the Nyquist-Shannon sampling theorem, to an efficient frequency domain implementation. To this end a buffer control is employed which compensates for sampling offsets which are multiples of the sampling period, while a digital filter, realized by the well-known Overlap-Save method, handles the fractional part of the sampling phase offset. With experiments on artificially misaligned data we investigate the parametrization, the efficiency, and the induced distortions of the proposed resampling method. It is shown that a favorable compromise between residual distortion and computational complexity is achieved, compared to other sampling rate offset compensation techniques.},
  author       = {Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold},
  booktitle    = {26th European Signal Processing Conference (EUSIPCO 2018)},
  title        = {{Efficient Sampling Rate Offset Compensation - An Overlap-Save Based Approach}},
  year         = {2018},
}

@inproceedings{11876,
  abstract     = {This paper describes the systems for the single-array track and the multiple-array track of the 5th CHiME Challenge. The final system is a combination of multiple systems, using Confusion Network Combination (CNC). The different systems presented here are utilizing different front-ends and training sets for a Bidirectional Long Short-Term Memory (BLSTM) Acoustic Model (AM). The front-end was replaced by enhancements provided by Paderborn University [1]. The back-end has been implemented using RASR [2] and RETURNN [3]. Additionally, a system combination including the hypothesis word graphs from the system of the submission [1] has been performed, which results in the final best system.},
  author       = {Kitza, Markus and Michel, Wilfried and Boeddeker, Christoph and Heitkaemper, Jens and Menne, Tobias and Schl{\"u}ter, Ralf and Ney, Hermann and Schmalenstroeer, Joerg and Drude, Lukas and Heymann, Jahn and Haeb-Umbach, Reinhold},
  booktitle    = {Proc. CHiME 2018 Workshop on Speech Processing in Everyday Environments, Hyderabad, India},
  title        = {{The RWTH/UPB System Combination for the CHiME 2018 Workshop}},
  year         = {2018},
}

@inproceedings{11836,
  abstract     = {Due to their distributed nature wireless acoustic sensor networks offer great potential for improved signal acquisition, processing and classification for applications such as monitoring and surveillance, home automation, or hands-free telecommunication. To reduce the communication demand with a central server and to raise the privacy level it is desirable to perform processing at node level. The limited processing and memory capabilities on a sensor node, however, stand in contrast to the compute and memory intensive deep learning algorithms used in modern speech and audio processing. In this work, we perform benchmarking of commonly used convolutional and recurrent neural network architectures on a Raspberry Pi based acoustic sensor node. We show that it is possible to run medium-sized neural network topologies used for speech enhancement and speech recognition in real time. For acoustic event recognition, where predictions in a lower temporal resolution are sufficient, it is even possible to run current state-of-the-art deep convolutional models with a real-time-factor of 0.11.},
  author       = {Ebbers, Janek and Heitkaemper, Jens and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold},
  booktitle    = {ITG 2018, Oldenburg, Germany},
  title        = {{Benchmarking Neural Network Architectures for Acoustic Sensor Networks}},
  year         = {2018},
}

@inproceedings{11839,
  abstract     = {It has been experimentally verified that sampling rate offsets (SROs) between the input channels of an acoustic beamformer have a detrimental effect on the achievable SNR gains. In this paper we derive an analytic model to study the impact of SRO on the estimation of the spatial noise covariance matrix used in MVDR beamforming. It is shown that a perfect compensation of the SRO is impossible if the noise covariance matrix is estimated by time averaging, even if the SRO is perfectly known. The SRO should therefore be compensated for prior to beamformer coefficient estimation. We present a novel scheme where SRO compensation and beamforming closely interact, saving some computational effort compared to separate SRO adjustment followed by acoustic beamforming.},
  author       = {Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold},
  booktitle    = {ITG 2018, Oldenburg, Germany},
  title        = {{Insights into the Interplay of Sampling Rate Offsets and MVDR Beamforming}},
  year         = {2018},
}

@inproceedings{11717,
  abstract     = {In this work, we address the limited availability of large annotated databases for real-life audio event detection by utilizing the concept of transfer learning. This technique aims to transfer knowledge from a source domain to a target domain, even if source and target have different feature distributions and label sets. We hypothesize that all acoustic events share the same inventory of basic acoustic building blocks and differ only in the temporal order of these acoustic units. We then construct a deep neural network with convolutional layers for extracting the acoustic units and a recurrent layer for capturing the temporal order. Under the above hypothesis, transfer learning from a source to a target domain with a different acoustic event inventory is realized by transferring the convolutional layers from the source to the target domain. The recurrent layer is, however, learnt directly from the target domain. Experiments on the transfer from a synthetic source database to the real-life target database of DCASE 2016 demonstrate that transfer learning leads to improved detection performance on average. However, the successful transfer to detect events which are very different from what was seen in the source domain, could not be verified.},
  author       = {Arora, Prerna and Haeb-Umbach, Reinhold},
  booktitle    = {IEEE 19th International Workshop on Multimedia Signal Processing (MMSP)},
  title        = {{A Study on Transfer Learning for Acoustic Event Detection in a Real Life Scenario}},
  year         = {2017},
}

@techreport{11735,
  abstract     = {This report describes the computation of gradients by algorithmic differentiation for statistically optimum beamforming operations. Especially the derivation of complex-valued functions is a key component of this approach. Therefore the real-valued algorithmic differentiation is extended via the complex-valued chain rule. In addition to the basic mathematic operations the derivative of the eigenvalue problem with complex-valued eigenvectors is one of the key results of this report. The potential of this approach is shown with experimental results on the CHiME-3 challenge database. There, the beamforming task is used as a front-end for an ASR system. With the developed derivatives a joint optimization of a speech enhancement and speech recognition system w.r.t. the recognition optimization criterion is possible.},
  author       = {Boeddeker, Christoph and Hanebrink, Patrick and Drude, Lukas and Heymann, Jahn and Haeb-Umbach, Reinhold},
  institution  = {Paderborn University},
  title        = {{On the Computation of Complex-valued Gradients with Application to Statistically Optimum Beamforming}},
  year         = {2017},
}

@inproceedings{11736,
  abstract     = {In this paper we show how a neural network for spectral mask estimation for an acoustic beamformer can be optimized by algorithmic differentiation. Using the beamformer output SNR as the objective function to maximize, the gradient is propagated through the beamformer all the way to the neural network which provides the clean speech and noise masks from which the beamformer coefficients are estimated by eigenvalue decomposition. A key theoretical result is the derivative of an eigenvalue problem involving complex-valued eigenvectors. Experimental results on the CHiME-3 challenge database demonstrate the effectiveness of the approach. The tools developed in this paper are a key component for an end-to-end optimization of speech enhancement and speech recognition.},
  author       = {Boeddeker, Christoph and Hanebrink, Patrick and Drude, Lukas and Heymann, Jahn and Haeb-Umbach, Reinhold},
  booktitle    = {Proc. IEEE Intl. Conf. on Acoustics, Speech and Signal Processing (ICASSP)},
  title        = {{Optimizing Neural-Network Supported Acoustic Beamforming by Algorithmic Differentiation}},
  year         = {2017},
}

@inproceedings{11737,
  abstract     = {The benefits of both a logarithmic spectral amplitude (LSA) estimation and a modeling in a generalized spectral domain (where short-time amplitudes are raised to a generalized power exponent, not restricted to magnitude or power spectrum) are combined in this contribution to achieve a better tradeoff between speech quality and noise suppression in single-channel speech enhancement. A novel gain function is derived to enhance the logarithmic generalized spectral amplitudes of noisy speech. Experiments on the CHiME-3 dataset show that it outperforms the famous minimum mean squared error (MMSE) LSA gain function of Ephraim and Malah in terms of noise suppression by 1.4 dB, while the good speech quality of the MMSE-LSA estimator is maintained.},
  author       = {Chinaev, Aleksej and Haeb-Umbach, Reinhold},
  booktitle    = {Proc. IEEE Intl. Conf. on Acoustics, Speech and Signal Processing (ICASSP)},
  title        = {{A Generalized Log-Spectral Amplitude Estimator for Single-Channel Speech Enhancement}},
  year         = {2017},
}

@inproceedings{11754,
  abstract     = {Recent advances in discriminatively trained mask estimation networks to extract a single source utilizing beamforming techniques demonstrate, that the integration of statistical models and deep neural networks (DNNs) are a promising approach for robust automatic speech recognition (ASR) applications. In this contribution we demonstrate how discriminatively trained embeddings on spectral features can be tightly integrated into statistical model-based source separation to separate and transcribe overlapping speech. Good generalization to unseen spatial configurations is achieved by estimating a statistical model at test time, while still leveraging discriminative training of deep clustering embeddings on a separate training set. We formulate an expectation maximization (EM) algorithm which jointly estimates a model for deep clustering embeddings and complex-valued spatial observations in the short time Fourier transform (STFT) domain at test time. Extensive simulations confirm, that the integrated model outperforms (a) a deep clustering model with a subsequent beamforming step and (b) an EM-based model with a beamforming step alone in terms of signal to distortion ratio (SDR) and perceptually motivated metric (PESQ) gains. ASR results on a reverberated dataset further show, that the aforementioned gains translate to reduced word error rates (WERs) even in reverberant environments.},
  author       = {Drude, Lukas and Haeb-Umbach, Reinhold},
  booktitle    = {INTERSPEECH 2017, Stockholm, Sweden},
  title        = {{Tight integration of spatial and spectral features for BSS with Deep Clustering embeddings}},
  year         = {2017},
}

@inproceedings{11770,
  abstract     = {In this contribution we show how to exploit text data to support word discovery from audio input in an underresourced target language. Given audio, of which a certain amount is transcribed at the word level, and additional unrelated text data, the approach is able to learn a probabilistic mapping from acoustic units to characters and utilize it to segment the audio data into words without the need of a pronunciation dictionary. This is achieved by three components: an unsupervised acoustic unit discovery system, a supervisedly trained acoustic unit-to-grapheme converter, and a word discovery system, which is initialized with a language model trained on the text data. Experiments for multiple setups show that the initialization of the language model with text data improves the word segmentation performance by a large margin.},
  author       = {Glarner, Thomas and Boenninghoff, Benedikt and Walter, Oliver and Haeb-Umbach, Reinhold},
  booktitle    = {INTERSPEECH 2017, Stockholm, Sweden},
  title        = {{Leveraging Text Data for Word Segmentation for Underresourced Languages}},
  year         = {2017},
}

@inproceedings{11809,
  abstract     = {This paper presents an end-to-end training approach for a beamformer-supported multi-channel ASR system. A neural network which estimates masks for a statistically optimum beamformer is jointly trained with a network for acoustic modeling. To update its parameters, we propagate the gradients from the acoustic model all the way through feature extraction and the complex valued beamforming operation. Besides avoiding a mismatch between the front-end and the back-end, this approach also eliminates the need for stereo data, i.e., the parallel availability of clean and noisy versions of the signals. Instead, it can be trained with real noisy multichannel data only. Also, relying on the signal statistics for beamforming, the approach makes no assumptions on the configuration of the microphone array. We further observe a performance gain through joint training in terms of word error rate in an evaluation of the system on the CHiME 4 dataset.},
  author       = {Heymann, Jahn and Drude, Lukas and Boeddeker, Christoph and Hanebrink, Patrick and Haeb-Umbach, Reinhold},
  booktitle    = {Proc. IEEE Intl. Conf. on Acoustics, Speech and Signal Processing (ICASSP)},
  title        = {{BEAMNET: End-to-End Training of a Beamformer-Supported Multi-Channel ASR System}},
  year         = {2017},
}

@article{11811,
  abstract     = {Acoustic beamforming can greatly improve the performance of Automatic Speech Recognition (ASR) and speech enhancement systems when multiple channels are available. We recently proposed a way to support the model-based Generalized Eigenvalue beamforming operation with a powerful neural network for spectral mask estimation. The enhancement system has a number of desirable properties. In particular, neither assumptions need to be made about the nature of the acoustic transfer function (e.g., being anechoic), nor does the array configuration need to be known. While the system has been originally developed to enhance speech in noisy environments, we show in this article that it is also effective in suppressing reverberation, thus leading to a generic trainable multi-channel speech enhancement system for robust speech processing. To support this claim, we consider two distinct datasets: The CHiME 3 challenge, which features challenging real-world noise distortions, and the Reverb challenge, which focuses on distortions caused by reverberation. We evaluate the system both with respect to a speech enhancement and a recognition task. For the first task we propose a new way to cope with the distortions introduced by the Generalized Eigenvalue beamformer by renormalizing the target energy for each frequency bin, and measure its effectiveness in terms of the PESQ score. For the latter we feed the enhanced signal to a strong DNN back-end and achieve state-of-the-art ASR results on both datasets. We further experiment with different network architectures for spectral mask estimation: One small feed-forward network with only one hidden layer, one Convolutional Neural Network and one bi-directional Long Short-Term Memory network, showing that even a small network is capable of delivering significant performance improvements.},
  author       = {Heymann, Jahn and Drude, Lukas and Haeb-Umbach, Reinhold},
  journal      = {Computer Speech and Language},
  title        = {{A Generic Neural Acoustic Beamforming Architecture for Robust Multi-Channel Speech Processing}},
  year         = {2017},
}

@inproceedings{11763,
  abstract     = {In this paper, we apply a high-resolution approach, i.e. the matrix pencil method (MPM), to the FMCW automotive radar system to separate the neighboring targets, which share similar parameters, i.e. range, relative speed and azimuth angle, and cause overlapping in the radar spectrum. In order to adapt the 1D model of MPM to the 2D range-velocity spectrum and simultaneously limit the computational cost, some preprocessing steps are proposed to construct a novel separation algorithm. Finally, this algorithm is evaluated in both simulation and real data, and the results indicate a promising performance.},
  author       = {Fei, Tai and Grimm, Christopher and Farhoud, Ridha and Breddermann, Tobias and Warsitz, Ernst and Haeb-Umbach, Reinhold},
  booktitle    = {IEEE International Conference on Microwaves, Communications, Antennas and Electronic Systems (COMCAS)},
  title        = {{A Novel Target Separation Algorithm Applied to The Two-Dimensional Spectrum for FMCW Automotive Radar Systems}},
  year         = {2017},
}

@inproceedings{11772,
  abstract     = {In this paper, we present a hypothesis test for the classification of moving targets in the sight of an automotive radar sensor. For this purpose, a statistical model of the relative velocity between a stationary target and the radar sensor has been developed. With respect to the statistical properties a confidence interval is calculated and targets with relative velocity lying outside this interval are classified as moving targets. Compared to existing algorithms our approach is able to give robust classification independent of the number of observed moving targets and is characterized by an instantaneous classification, a simple parameterization of the model and an automatic calculation of the discriminating threshold.},
  author       = {Grimm, Christopher and Breddermann, Tobias and Farhoud, Ridha and Fei, Tai and Warsitz, Ernst and Haeb-Umbach, Reinhold},
  booktitle    = {IEEE International Conference on Microwaves, Communications, Antennas and Electronic Systems (COMCAS)},
  title        = {{Hypothesis Test for the Detection of Moving Targets in Automotive Radar}},
  year         = {2017},
}

@inproceedings{11759,
  abstract     = {Variational Autoencoders (VAEs) have been shown to provide efficient neural-network-based approximate Bayesian inference for observation models for which exact inference is intractable. Its extension, the so-called Structured VAE (SVAE) allows inference in the presence of both discrete and continuous latent variables. Inspired by this extension, we developed a VAE with Hidden Markov Models (HMMs) as latent models. We applied the resulting HMM-VAE to the task of acoustic unit discovery in a zero resource scenario. Starting from an initial model based on variational inference in an HMM with Gaussian Mixture Model (GMM) emission probabilities, the accuracy of the acoustic unit discovery could be significantly improved by the HMM-VAE. In doing so we were able to demonstrate for an unsupervised learning task what is well-known in the supervised learning case: Neural networks provide superior modeling power compared to GMMs.},
  author       = {Ebbers, Janek and Heymann, Jahn and Drude, Lukas and Glarner, Thomas and Haeb-Umbach, Reinhold and Raj, Bhiksha},
  booktitle    = {INTERSPEECH 2017, Stockholm, Sweden},
  title        = {{Hidden Markov Model Variational Autoencoder for Acoustic Unit Discovery}},
  year         = {2017},
}

@inproceedings{11895,
  abstract     = {Multi-channel speech enhancement algorithms rely on a synchronous sampling of the microphone signals. This, however, cannot always be guaranteed, especially if the sensors are distributed in an environment. To avoid performance degradation the sampling rate offset needs to be estimated and compensated for. In this contribution we extend the recently proposed coherence drift based method in two important directions. First, the increasing phase shift in the short-time Fourier transform domain is estimated from the coherence drift in a Matched Filter-like fashion, where intermediate estimates are weighted by their instantaneous SNR. Second, an observed bias is removed by iterating between offset estimation and compensation by resampling a couple of times. The effectiveness of the proposed method is demonstrated by speech recognition results on the output of a beamformer with and without sampling rate offset compensation between the input channels. We compare MVDR and maximum-SNR beamformers in reverberant environments and further show that both benefit from a novel phase normalization, which we also propose in this contribution.},
  author       = {Schmalenstroeer, Joerg and Heymann, Jahn and Drude, Lukas and Boeddeker, Christoph and Haeb-Umbach, Reinhold},
  booktitle    = {IEEE 19th International Workshop on Multimedia Signal Processing (MMSP)},
  title        = {{Multi-Stage Coherence Drift Based Sampling Rate Synchronization for Acoustic Beamforming}},
  year         = {2017},
}

@inproceedings{11773,
  abstract     = {In this paper we present an algorithm for the detection of moving targets in sight of an automotive radar sensor which can handle distorted ego-velocity information. In situations where biased or no velocity information is provided from the ego-vehicle, the algorithm is able to estimate the ego-velocity based on previously detected stationary targets with high accuracy, subsequently used for the target classification. Compared to existing ego-velocity algorithms our approach provides fast and efficient inference without sacrificing the practical classification accuracy. Other than that the algorithm is characterized by simple parameterization and little but appropriate model assumptions for high accurate production automotive radar sensors.},
  author       = {Grimm, Christopher and Farhoud, Ridha and Fei, Tai and Warsitz, Ernst and Haeb-Umbach, Reinhold},
  booktitle    = {IEEE Microwaves, Radar and Remote Sensing Symposium (MRRS)},
  title        = {{Detection of Moving Targets in Automotive Radar with Distorted Ego-Velocity Information}},
  year         = {2017},
}

@inproceedings{11738,
  abstract     = {In this contribution we investigate a priori signal-to-noise ratio (SNR) estimation, a crucial component of a single-channel speech enhancement system based on spectral subtraction. The majority of the state-of-the-art a priori SNR estimators work in the power spectral domain, which is, however, not confirmed to be the optimal domain for the estimation. Motivated by the generalized spectral subtraction rule, we show how the estimation of the a priori SNR can be formulated in the so called generalized SNR domain. This formulation allows to generalize the widely used decision directed (DD) approach. An experimental investigation with different noise types reveals the superiority of the generalized DD approach over the conventional DD approach in terms of both the mean opinion score - listening quality objective measure and the output global SNR in the medium to high input SNR regime, while we show that the power spectrum is the optimal domain for low SNR. We further develop a parameterization which adjusts the domain of estimation automatically according to the estimated input global SNR.},
  author       = {Chinaev, Aleksej and Haeb-Umbach, Reinhold},
  booktitle    = {INTERSPEECH 2016, San Francisco, USA},
  keywords     = {single-channel speech enhancement, a priori SNR estimation, generalized spectral subtraction},
  title        = {{A Priori SNR Estimation Using a Generalized Decision Directed Approach}},
  year         = {2016},
}

@inproceedings{11743,
  abstract     = {This contribution introduces a novel causal a priori signal-to-noise ratio (SNR) estimator for single-channel speech enhancement. To exploit the advantages of the generalized spectral subtraction, a normalized {$\alpha$}-order magnitude (NAOM) domain is introduced where an a priori SNR estimation is carried out. In this domain, the NAOM coefficients of noise and clean speech signals are modeled by a Weibull distribution and a Weibull mixture model (WMM), respectively. While the parameters of the noise model are calculated from the noise power spectral density estimates, the speech WMM parameters are estimated from the noisy signal by applying a causal Expectation-Maximization algorithm. Further a maximum a posteriori estimate of the a priori SNR is developed. The experiments in different noisy environments show the superiority of the proposed estimator compared to the well-known decision-directed approach in terms of estimation error, estimator variance and speech quality of the enhanced signals when used for speech enhancement.},
  author       = {Chinaev, Aleksej and Heitkaemper, Jens and Haeb-Umbach, Reinhold},
  booktitle    = {12. ITG Fachtagung Sprachkommunikation (ITG 2016)},
  title        = {{A Priori SNR Estimation Using Weibull Mixture Model}},
  year         = {2016},
}

@inproceedings{11744,
  abstract     = {A noise power spectral density (PSD) estimation is an indispensable component of speech spectral enhancement systems. In this paper we present a noise PSD tracking algorithm, which employs a noise presence probability estimate delivered by a deep neural network (DNN). The algorithm provides a causal noise PSD estimate and can thus be used in speech enhancement systems for communication purposes. An extensive performance comparison has been carried out with ten causal state-of-the-art noise tracking algorithms taken from the literature and categorized according to applied techniques. The experiments showed that the proposed DNN-based noise PSD tracker outperforms all competing methods with respect to all tested performance measures, which include the noise tracking performance and the performance of a speech enhancement system employing the noise tracking component.},
  author       = {Chinaev, Aleksej and Heymann, Jahn and Drude, Lukas and Haeb-Umbach, Reinhold},
  booktitle    = {12. ITG Fachtagung Sprachkommunikation (ITG 2016)},
  title        = {{Noise-Presence-Probability-Based Noise PSD Estimation by Using DNNs}},
  year         = {2016},
}

