@inproceedings{11965,
  abstract     = {{We present an unsupervised training approach for a neural network-based mask estimator in an acoustic beamforming application. The network is trained to maximize a likelihood criterion derived from a spatial mixture model of the observations. It is trained from scratch without requiring any parallel data consisting of degraded input and clean training targets. Thus, training can be carried out on real recordings of noisy speech rather than simulated ones. In contrast to previous work on unsupervised training of neural mask estimators, our approach avoids the need for a possibly pre-trained teacher model entirely. We demonstrate the effectiveness of our approach by speech recognition experiments on two different datasets: one mainly deteriorated by noise (CHiME 4) and one by reverberation (REVERB). The results show that the performance of the proposed system is on par with a supervised system using oracle target masks for training and with a system trained using a model-based teacher.}},
  author       = {{Drude, Lukas and Heymann, Jahn and Haeb-Umbach, Reinhold}},
  booktitle    = {{INTERSPEECH 2019, Graz, Austria}},
  title        = {{{Unsupervised training of neural mask-based beamforming}}},
  year         = {{2019}},
}
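
The likelihood criterion described in the entry above can be illustrated with a toy spatial mixture model. The Python sketch below fits a two-class (speech/noise) mixture of zero-mean complex Gaussians to the multi-channel STFT observations of a single frequency bin via EM; the class posteriors play the role of time-frequency masks. It is a deliberately simplified stand-in for the paper's model: the plain Gaussian choice, the shapes, and the initialization are illustrative assumptions, not the published algorithm.

    import numpy as np

    def em_spatial_mixture(Y, n_iter=20, eps=1e-6):
        """Toy EM for a two-class spatial mixture of zero-mean complex Gaussians.

        Y: observations of one frequency bin, shape (T, D), complex.
        Returns class posteriors ("masks"), shape (T, 2).
        """
        T, D = Y.shape
        gamma = np.random.dirichlet([1.0, 1.0], size=T)     # random responsibilities
        for _ in range(n_iter):
            log_post = np.empty((T, 2))
            for k in range(2):
                g = gamma[:, k]
                # class covariance, weighted by the current responsibilities
                R = np.einsum('t,td,te->de', g, Y, Y.conj()) / g.sum()
                R += eps * np.trace(R).real / D * np.eye(D)  # diagonal loading
                Rinv = np.linalg.inv(R)
                quad = np.einsum('td,de,te->t', Y.conj(), Rinv, Y).real
                _, logdet = np.linalg.slogdet(R)
                log_post[:, k] = np.log(g.mean()) - logdet - quad
            log_post -= log_post.max(axis=1, keepdims=True)  # numerical stability
            gamma = np.exp(log_post)
            gamma /= gamma.sum(axis=1, keepdims=True)
        return gamma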

@inproceedings{12875,
  abstract     = {{Signal dereverberation using the Weighted Prediction Error (WPE) method has been proven to be an effective means to raise the accuracy of far-field speech recognition. First proposed as an iterative algorithm, follow-up works have reformulated it as a recursive least squares algorithm and therefore enabled its use in online applications. For this algorithm, the estimation of the power spectral density (PSD) of the anechoic signal plays an important role and strongly influences its performance. Recently, we showed that using a neural network PSD estimator leads to improved performance for online automatic speech recognition. This, however, comes at a price. To train the network, we require parallel data, i.e., utterances simultaneously available in clean and reverberated form. Here we propose to overcome this limitation by training the network jointly with the acoustic model of the speech recognizer. To be specific, the gradients computed from the cross-entropy loss between the target senone sequence and the acoustic model network output are backpropagated through the complex-valued dereverberation filter estimation to the neural network for PSD estimation. Evaluation on two databases demonstrates improved performance for online processing scenarios while imposing fewer requirements on the available training data and thus widening the range of applications.}},
  author       = {{Heymann, Jahn and Drude, Lukas and Haeb-Umbach, Reinhold and Kinoshita, Keisuke and Nakatani, Tomohiro}},
  booktitle    = {{ICASSP 2019, Brighton, UK}},
  title        = {{{Joint Optimization of Neural Network-based WPE Dereverberation and Acoustic Model for Robust Online ASR}}},
  year         = {{2019}},
}
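
The central mechanism of the entry above -- backpropagating a downstream loss through the closed-form WPE filter estimate into the PSD estimator -- can be sketched with complex-valued autograd. In the toy below (PyTorch with complex autograd support; single channel, single frequency bin) a dummy loss stands in for the acoustic-model cross-entropy, and the tiny psd_net is a hypothetical stand-in for the paper's estimator network:

    import torch

    T, taps, delay = 200, 4, 2
    y = torch.randn(T, dtype=torch.complex64)          # toy STFT bin

    # hypothetical PSD estimator: log-magnitude in, positive PSD weight out
    psd_net = torch.nn.Sequential(
        torch.nn.Linear(1, 16), torch.nn.ReLU(),
        torch.nn.Linear(16, 1), torch.nn.Softplus())
    lam = psd_net(torch.log(y.abs() + 1e-6).unsqueeze(-1)).squeeze(-1) + 1e-6

    # stacked delayed observations: ytilde[t, d] = y[t - delay - d]
    y_pad = torch.cat([torch.zeros(delay + taps - 1, dtype=y.dtype), y])
    ytilde = torch.stack(
        [y_pad[taps - 1 - d : taps - 1 - d + T] for d in range(taps)], dim=-1)

    # closed-form WPE filter G = R^{-1} p with per-frame weights 1 / lambda
    w = (1.0 / lam).to(y.dtype)
    R = torch.einsum('t,ta,tb->ab', w, ytilde, ytilde.conj())
    p = torch.einsum('t,ta,t->a', w, ytilde, y.conj())
    G = torch.linalg.solve(R + 1e-3 * torch.eye(taps), p.unsqueeze(-1)).squeeze(-1)
    x = y - torch.mv(ytilde, G.conj())                 # dereverberated signal

    loss = x.abs().pow(2).mean()        # stand-in for the AM cross-entropy loss
    loss.backward()
    print(psd_net[0].weight.grad.norm())  # gradients reached the PSD estimator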

@inproceedings{18107,
  author       = {{Heymann, Jahn and Bacchiani, Michiel and Sainath, Tara N.}},
  booktitle    = {{2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}},
  pages        = {{6722--6726}},
  title        = {{{Performance of Mask Based Statistical Beamforming in a Smart Home Scenario}}},
  doi          = {{10.1109/ICASSP.2018.8462372}},
  year         = {{2018}},
}

@inproceedings{11835,
  abstract     = {{Signal dereverberation using the weighted prediction error (WPE) method has been proven to be an effective means to raise the accuracy of far-field speech recognition. But in its original formulation, WPE requires multiple iterations over a sufficiently long utterance, rendering it unsuitable for online low-latency applications. Recently, two methods have been proposed to overcome this limitation. One utilizes a neural network to estimate the power spectral density (PSD) of the target signal and works in a block-online fashion. The other method relies on a rather simple PSD estimation which smoothes the observed PSD and utilizes a recursive formulation which enables it to work on a frame-by-frame basis. In this paper, we integrate a deep neural network (DNN) based estimator into the recursive frame-online formulation. We evaluate the performance of the recursive system with different PSD estimators in comparison to the block-online and offline variants on two distinct corpora: the REVERB challenge data, where the signal is mainly deteriorated by reverberation, and a database which combines WSJ and VoiceHome to also include (directed) noise sources. The results show that although smoothing works surprisingly well, the more sophisticated DNN-based estimator shows promising improvements and shortens the performance gap between online and offline processing.}},
  author       = {{Heymann, Jahn and Drude, Lukas and Haeb-Umbach, Reinhold and Kinoshita, Keisuke and Nakatani, Tomohiro}},
  booktitle    = {{IWAENC 2018, Tokyo, Japan}},
  title        = {{{Frame-Online DNN-WPE Dereverberation}}},
  year         = {{2018}},
}
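
The recursive formulation reduces to a standard RLS update per frequency bin. A minimal NumPy sketch of a single frame update is given below; the conventions (forgetting factor alpha, stacked delayed observations ytilde) follow the usual RLS literature and may differ in detail from the paper, and lam would come from either the smoothed-PSD or the DNN-based estimator discussed above:

    import numpy as np

    def online_wpe_step(y_t, ytilde, G, P, lam, alpha=0.99):
        """One frame-online WPE (RLS) update for a single frequency bin.

        y_t:    current observation (complex scalar)
        ytilde: stacked delayed observations, shape (K,)
        G:      prediction filter, shape (K,)
        P:      inverse weighted correlation matrix, shape (K, K)
        lam:    PSD estimate of the target signal for this frame
        """
        nom = P.dot(ytilde)
        k = nom / (alpha * lam + np.vdot(ytilde, nom))       # Kalman gain
        x_t = y_t - np.vdot(G, ytilde)                       # dereverberated output
        G = G + k * np.conj(x_t)                             # filter update
        P = (P - np.outer(k, ytilde.conj().dot(P))) / alpha  # inverse-correlation update
        return x_t, G, P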

@inproceedings{11837,
  abstract     = {{We present a block-online multi-channel front end for automatic speech recognition in noisy and reverberated environments. It is an online version of our earlier proposed neural network supported acoustic beamformer, whose coefficients are calculated from noise and speech spatial covariance matrices which are estimated utilizing a neural mask estimator. However, the sparsity of speech in the STFT domain causes problems for the initial estimation of the beamformer coefficients in some frequency bins due to a lack of speech observations. We propose two methods to mitigate this issue. The first is to lower the frequency resolution of the STFT, which comes with the additional advantage of a reduced time window, thus lowering the latency introduced by block processing. The second approach is to smooth the beamforming coefficients along the frequency axis, thus exploiting their high inter-frequency correlation. With both approaches the gap between offline and block-online beamformer performance, as measured by the word error rate achieved by a downstream speech recognizer, is significantly reduced. Experiments are carried out on two corpora, representing noisy (CHiME-4) and noisy reverberant (voiceHome) environments.}},
  author       = {{Heitkaemper, Jens and Heymann, Jahn and Haeb-Umbach, Reinhold}},
  booktitle    = {{ITG 2018, Oldenburg, Germany}},
  title        = {{{Smoothing along Frequency in Online Neural Network Supported Acoustic Beamforming}}},
  year         = {{2018}},
}
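
The second approach amounts to a sliding average of the complex beamforming coefficients over neighbouring frequency bins. Since GEV coefficients carry an arbitrary phase per frequency, some phase alignment is needed before averaging; the NumPy sketch below aligns all coefficient vectors such that the reference channel is real and positive, which is one plausible choice, not necessarily the paper's:

    import numpy as np

    def smooth_along_frequency(w, width=3, ref=0):
        """Moving average of beamforming coefficients along frequency.

        w: coefficients, shape (F, D), complex.
        """
        # remove the arbitrary per-frequency phase
        phase = np.exp(-1j * np.angle(w[:, ref:ref + 1]))
        wa = w * phase
        kernel = np.ones(width) / width
        sm = np.stack([np.convolve(wa[:, d], kernel, mode='same')
                       for d in range(w.shape[1])], axis=1)
        return sm * np.conj(phase)   # restore the original phase convention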

@inproceedings{11872,
  abstract     = {{The weighted prediction error (WPE) algorithm has proven to be a very successful dereverberation method for the REVERB challenge. Likewise, neural network based mask estimation for beamforming demonstrated very good noise suppression in the CHiME 3 and CHiME 4 challenges. Recently, it has been shown that this estimator can also be trained to perform dereverberation and denoising jointly. However, up to now a comparison of a neural beamformer and WPE is still missing, as is an investigation into a combination of the two. Therefore, we here provide an extensive evaluation of both and consequently propose variants to integrate deep neural network based beamforming with WPE. For these integrated variants we identify a consistent word error rate (WER) reduction on two distinct databases. In particular, our study shows that deep learning based beamforming benefits from a model-based dereverberation technique (i.e. WPE) and vice versa. Our key findings are: (a) the more channels and the more noise are present, the lower the WER of neural beamforming compared to WPE; (b) integration of WPE and a neural beamformer consistently outperforms all stand-alone systems.}},
  author       = {{Drude, Lukas and Boeddeker, Christoph and Heymann, Jahn and Kinoshita, Keisuke and Delcroix, Marc and Nakatani, Tomohiro and Haeb-Umbach, Reinhold}},
  booktitle    = {{INTERSPEECH 2018, Hyderabad, India}},
  title        = {{{Integrating neural network based beamforming and weighted prediction error dereverberation}}},
  year         = {{2018}},
}
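
A typical integrated pipeline runs WPE first and feeds the dereverberated multi-channel signal to a mask-based beamformer (the paper evaluates several such variants). A sketch, assuming STFT data of shape (F, D, T), a hypothetical mask_net, and a Souden-style MVDR as one concrete beamformer choice:

    import numpy as np
    from nara_wpe.wpe import wpe   # see the NARA-WPE entry below

    def mvdr_souden(phi_xx, phi_nn, ref=0):
        """MVDR beamformer in the Souden formulation, per frequency bin."""
        M = np.linalg.solve(phi_nn, phi_xx)                  # (F, D, D)
        return M[..., ref] / np.trace(M, axis1=-2, axis2=-1)[:, None]

    def enhance(Y, mask_net):
        """Y: STFT observations, shape (F, D, T), complex."""
        Z = wpe(Y, taps=10, delay=3, iterations=3)           # dereverberation
        m_x, m_n = mask_net(Z)                               # hypothetical (F, T) masks
        phi_xx = np.einsum('ft,fdt,fet->fde', m_x, Z, Z.conj()) / m_x.sum(-1)[:, None, None]
        phi_nn = np.einsum('ft,fdt,fet->fde', m_n, Z, Z.conj()) / m_n.sum(-1)[:, None, None]
        w = mvdr_souden(phi_xx, phi_nn)                      # (F, D)
        return np.einsum('fd,fdt->ft', w.conj(), Z)          # beamformed STFT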

@inproceedings{11873,
  abstract     = {{NARA-WPE is a Python software package providing implementations of the weighted prediction error (WPE) dereverberation algorithm. WPE has been shown to be a highly effective tool for speech dereverberation, thus improving the perceptual quality of the signal and the recognition performance of downstream automatic speech recognition (ASR). It is suitable both for single-channel and multi-channel applications. The package consists of (1) a Numpy implementation which can easily be integrated into a custom Python toolchain, and (2) a TensorFlow implementation which allows integration into larger computational graphs and enables backpropagation through WPE to train more advanced front-ends. The package provides an iterative offline (batch) version, a block-online version, and a frame-online version which can be used in moderately low latency applications, e.g. digital speech assistants.}},
  author       = {{Drude, Lukas and Heymann, Jahn and Boeddeker, Christoph and Haeb-Umbach, Reinhold}},
  booktitle    = {{ITG 2018, Oldenburg, Germany}},
  title        = {{{NARA-WPE: A Python package for weighted prediction error dereverberation in Numpy and Tensorflow for online and offline processing}}},
  year         = {{2018}},
}
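
Typical offline usage of the package (NumPy variant) looks as follows; the shape convention and the stft/istft helpers follow the repository's examples as of the 2018 release, so consult the README in case the API has moved on:

    import soundfile as sf
    from nara_wpe.wpe import wpe
    from nara_wpe.utils import stft, istft

    y, fs = sf.read('observation.wav')         # (samples, D) multi-channel recording
    Y = stft(y.T, size=512, shift=128)         # (D, T, F)
    Z = wpe(Y.transpose(2, 0, 1),              # wpe expects (F, D, T)
            taps=10, delay=3, iterations=5)
    z = istft(Z.transpose(1, 2, 0), size=512, shift=128)
    sf.write('enhanced.wav', z.T, fs)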

@inproceedings{29923,
  abstract     = {{This paper introduces a new open source platform for end-to-end speech processing named ESPnet. ESPnet mainly focuses on end-to-end automatic speech recognition (ASR) and adopts the widely used dynamic neural network toolkits Chainer and PyTorch as its main deep learning engines. ESPnet also follows the Kaldi ASR toolkit style for data processing, feature extraction/format, and recipes to provide a complete setup for speech recognition and other speech processing experiments. This paper explains the major architecture of this software platform, several important functionalities which differentiate ESPnet from other open source ASR toolkits, and experimental results with major ASR benchmarks.}},
  author       = {{Watanabe, Shinji and Hori, Takaaki and Karita, Shigeki and Hayashi, Tomoki and Nishitoba, Jiro and Unno, Yuya and Enrique Yalta Soplin, Nelson and Heymann, Jahn and Wiesner, Matthew and Chen, Nanxin and Renduchintala, Adithya and Ochiai, Tsubasa}},
  booktitle    = {{INTERSPEECH 2018, Hyderabad, India}},
  pages        = {{2207--2211}},
  title        = {{{ESPnet: End-to-End Speech Processing Toolkit}}},
  doi          = {{10.21437/Interspeech.2018-1456}},
  year         = {{2018}},
}

@inproceedings{11876,
  abstract     = {{This paper describes the systems for the single-array track and the multiple-array track of the 5th CHiME Challenge. The final system is a combination of multiple systems, using Confusion Network Combination (CNC). The different systems presented here utilize different front-ends and training sets for a Bidirectional Long Short-Term Memory (BLSTM) Acoustic Model (AM). The front-end was replaced by enhancements provided by Paderborn University [1]. The back-end has been implemented using RASR [2] and RETURNN [3]. Additionally, a system combination including the hypothesis word graphs from the system of submission [1] has been performed, which yields the overall best system.}},
  author       = {{Kitza, Markus and Michel, Wilfried and Boeddeker, Christoph and Heitkaemper, Jens and Menne, Tobias and Schlüter, Ralf and Ney, Hermann and Schmalenstroeer, Joerg and Drude, Lukas and Heymann, Jahn and Haeb-Umbach, Reinhold}},
  booktitle    = {{Proc. CHiME 2018 Workshop on Speech Processing in Everyday Environments, Hyderabad, India}},
  title        = {{{The RWTH/UPB System Combination for the CHiME 2018 Workshop}}},
  year         = {{2018}},
}

@techreport{11735,
  abstract     = {{This report describes the computation of gradients by algorithmic differentiation for statistically optimum beamforming operations. In particular, the differentiation of complex-valued functions is a key component of this approach. To this end, real-valued algorithmic differentiation is extended via the complex-valued chain rule. In addition to the basic mathematical operations, the derivative of the eigenvalue problem with complex-valued eigenvectors is one of the key results of this report. The potential of this approach is shown with experimental results on the CHiME-3 challenge database, where the beamforming task is used as a front-end for an ASR system. With the developed derivatives, a joint optimization of a speech enhancement and speech recognition system w.r.t. the recognition optimization criterion is possible.}},
  author       = {{Boeddeker, Christoph and Hanebrink, Patrick and Drude, Lukas and Heymann, Jahn and Haeb-Umbach, Reinhold}},
  title        = {{{On the Computation of Complex-valued Gradients with Application to Statistically Optimum Beamforming}}},
  year         = {{2017}},
}
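
Two standard identities at the core of this approach can be stated compactly; the report's own contribution, the gradient of the eigenvalue problem with complex-valued eigenvectors, builds on these but is considerably more involved. In LaTeX:

    % Wirtinger derivatives for f(z), z = x + iy:
    \frac{\partial f}{\partial z}
      = \frac{1}{2}\Big(\frac{\partial f}{\partial x} - i\,\frac{\partial f}{\partial y}\Big),
    \qquad
    \frac{\partial f}{\partial z^*}
      = \frac{1}{2}\Big(\frac{\partial f}{\partial x} + i\,\frac{\partial f}{\partial y}\Big)

    % First-order perturbation of a simple eigenvalue \lambda of a Hermitian
    % matrix A with unit-norm eigenvector u:
    \partial\lambda = u^{\mathsf{H}}\,(\partial A)\,u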

@inproceedings{11736,
  abstract     = {{In this paper we show how a neural network for spectral mask estimation for an acoustic beamformer can be optimized by algorithmic differentiation. Using the beamformer output SNR as the objective function to maximize, the gradient is propagated through the beamformer all the way to the neural network which provides the clean speech and noise masks from which the beamformer coefficients are estimated by eigenvalue decomposition. A key theoretical result is the derivative of an eigenvalue problem involving complex-valued eigenvectors. Experimental results on the CHiME-3 challenge database demonstrate the effectiveness of the approach. The tools developed in this paper are a key component for an end-to-end optimization of speech enhancement and speech recognition.}},
  author       = {{Boeddeker, Christoph and Hanebrink, Patrick and Drude, Lukas and Heymann, Jahn and Haeb-Umbach, Reinhold}},
  booktitle    = {{Proc. IEEE Intl. Conf. on Acoustics, Speech and Signal Processing (ICASSP)}},
  title        = {{{Optimizing Neural-Network Supported Acoustic Beamforming by Algorithmic Differentiation}}},
  year         = {{2017}},
}
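
With modern complex autograd, the whole chain -- masks, spatial covariances, generalized eigenvector, output SNR -- is differentiable out of the box. The paper derives these gradients analytically; the PyTorch sketch below instead leans on built-in autograd (recent PyTorch, simple eigenvalues assumed) and reduces the generalized eigenproblem to an ordinary Hermitian one by whitening, which is our choice for numerical convenience. mask_x and mask_n stand in for the outputs of a mask network:

    import torch

    def gev_snr(mask_x, mask_n, Y, eps=1e-6):
        """Output SNR of the GEV (max-SNR) beamformer for one frequency bin.

        mask_x, mask_n: real masks, shape (T,)
        Y:              observations, shape (D, T), complex
        """
        phi_xx = torch.matmul(mask_x * Y, Y.mH) / mask_x.sum()
        phi_nn = torch.matmul(mask_n * Y, Y.mH) / mask_n.sum()
        phi_nn = phi_nn + eps * torch.eye(Y.shape[0])
        L = torch.linalg.cholesky(phi_nn)           # phi_nn = L L^H
        # whiten: B = L^{-1} phi_xx L^{-H}
        B = torch.linalg.solve(L, torch.linalg.solve(L, phi_xx).mH)
        _, evecs = torch.linalg.eigh(B)             # differentiable EVD
        w = torch.linalg.solve(L.mH, evecs[:, -1:]).squeeze(-1)  # principal GEV vector
        num = torch.vdot(w, torch.mv(phi_xx, w)).real
        den = torch.vdot(w, torch.mv(phi_nn, w)).real
        return num / den

    # a mask network would be trained to maximize this, e.g.:
    # (-gev_snr(mask_x, mask_n, Y)).backward()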

@inproceedings{11809,
  abstract     = {{This paper presents an end-to-end training approach for a beamformer-supported multi-channel ASR system. A neural network which estimates masks for a statistically optimum beamformer is jointly trained with a network for acoustic modeling. To update its parameters, we propagate the gradients from the acoustic model all the way through feature extraction and the complex-valued beamforming operation. Besides avoiding a mismatch between the front-end and the back-end, this approach also eliminates the need for stereo data, i.e., the parallel availability of clean and noisy versions of the signals. Instead, it can be trained with real noisy multi-channel data only. Also, relying on the signal statistics for beamforming, the approach makes no assumptions on the configuration of the microphone array. We further observe a performance gain through joint training in terms of word error rate in an evaluation of the system on the CHiME 4 dataset.}},
  author       = {{Heymann, Jahn and Drude, Lukas and Boeddeker, Christoph and Hanebrink, Patrick and Haeb-Umbach, Reinhold}},
  booktitle    = {{Proc. IEEE Intl. Conf. on Acoustics, Speech and Signal Processing (ICASSP)}},
  title        = {{{BEAMNET: End-to-End Training of a Beamformer-Supported Multi-Channel ASR System}}},
  year         = {{2017}},
}

@article{11811,
  abstract     = {{Acoustic beamforming can greatly improve the performance of Automatic Speech Recognition (ASR) and speech enhancement systems when multiple channels are available. We recently proposed a way to support the model-based Generalized Eigenvalue beamforming operation with a powerful neural network for spectral mask estimation. The enhancement system has a number of desirable properties. In particular, no assumptions need to be made about the nature of the acoustic transfer function (e.g., that it is anechoic), nor does the array configuration need to be known. While the system was originally developed to enhance speech in noisy environments, we show in this article that it is also effective in suppressing reverberation, thus leading to a generic trainable multi-channel speech enhancement system for robust speech processing. To support this claim, we consider two distinct datasets: the CHiME 3 challenge, which features challenging real-world noise distortions, and the Reverb challenge, which focuses on distortions caused by reverberation. We evaluate the system both with respect to a speech enhancement and a recognition task. For the first task we propose a new way to cope with the distortions introduced by the Generalized Eigenvalue beamformer by renormalizing the target energy for each frequency bin, and measure its effectiveness in terms of the PESQ score. For the latter we feed the enhanced signal to a strong DNN back-end and achieve state-of-the-art ASR results on both datasets. We further experiment with different network architectures for spectral mask estimation: one small feed-forward network with only one hidden layer, one Convolutional Neural Network, and one bi-directional Long Short-Term Memory network, showing that even a small network is capable of delivering significant performance improvements.}},
  author       = {{Heymann, Jahn and Drude, Lukas and Haeb-Umbach, Reinhold}},
  journal      = {{Computer Speech and Language}},
  title        = {{{A Generic Neural Acoustic Beamforming Architecture for Robust Multi-Channel Speech Processing}}},
  year         = {{2017}},
}

@inproceedings{11759,
  abstract     = {{Variational Autoencoders (VAEs) have been shown to provide efficient neural-network-based approximate Bayesian inference for observation models for which exact inference is intractable. Their extension, the so-called Structured VAE (SVAE), allows inference in the presence of both discrete and continuous latent variables. Inspired by this extension, we developed a VAE with Hidden Markov Models (HMMs) as latent models. We applied the resulting HMM-VAE to the task of acoustic unit discovery in a zero resource scenario. Starting from an initial model based on variational inference in an HMM with Gaussian Mixture Model (GMM) emission probabilities, the accuracy of the acoustic unit discovery could be significantly improved by the HMM-VAE. In doing so we were able to demonstrate for an unsupervised learning task what is well known in the supervised learning case: neural networks provide superior modeling power compared to GMMs.}},
  author       = {{Ebbers, Janek and Heymann, Jahn and Drude, Lukas and Glarner, Thomas and Haeb-Umbach, Reinhold and Raj, Bhiksha}},
  booktitle    = {{INTERSPEECH 2017, Stockholm, Sweden}},
  title        = {{{Hidden Markov Model Variational Autoencoder for Acoustic Unit Discovery}}},
  year         = {{2017}},
}

@inproceedings{11895,
  abstract     = {{Multi-channel speech enhancement algorithms rely on a synchronous sampling of the microphone signals. This, however, cannot always be guaranteed, especially if the sensors are distributed in an environment. To avoid performance degradation the sampling rate offset needs to be estimated and compensated for. In this contribution we extend the recently proposed coherence drift based method in two important directions. First, the increasing phase shift in the short-time Fourier transform domain is estimated from the coherence drift in a matched-filter-like fashion, where intermediate estimates are weighted by their instantaneous SNR. Second, an observed bias is removed by iterating a few times between offset estimation and compensation by resampling. The effectiveness of the proposed method is demonstrated by speech recognition results on the output of a beamformer with and without sampling rate offset compensation between the input channels. We compare MVDR and maximum-SNR beamformers in reverberant environments and further show that both benefit from a novel phase normalization, which we also propose in this contribution.}},
  author       = {{Schmalenstroeer, Joerg and Heymann, Jahn and Drude, Lukas and Boeddeker, Christoph and Haeb-Umbach, Reinhold}},
  booktitle    = {{IEEE 19th International Workshop on Multimedia Signal Processing (MMSP)}},
  title        = {{{Multi-Stage Coherence Drift Based Sampling Rate Synchronization for Acoustic Beamforming}}},
  year         = {{2017}},
}
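
The estimation itself (coherence drift, matched-filter weighting, bias removal) is the contribution of the paper and is omitted here, but the compensation step follows the standard first-order STFT model: a sampling rate offset shows up as a linear phase ramp that grows with frame index and frequency. A NumPy sketch under that model, assuming FFT size "size" and frame shift "shift":

    import numpy as np

    def compensate_sro(Y, eps, size=512, shift=128):
        """Compensate a known/estimated sampling rate offset in the STFT domain.

        Y:   STFT of one channel, shape (T, F) with F = size // 2 + 1
        eps: relative sampling rate offset (e.g. 1e-5 for 10 ppm)
        """
        t = np.arange(Y.shape[0])[:, None]   # frame index
        k = np.arange(Y.shape[1])[None, :]   # frequency bin index
        # accumulated delay after t frames: eps * t * shift samples
        return Y * np.exp(-2j * np.pi * k / size * eps * t * shift)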

@inproceedings{11744,
  abstract     = {{Noise power spectral density (PSD) estimation is an indispensable component of speech spectral enhancement systems. In this paper we present a noise PSD tracking algorithm which employs a noise presence probability estimate delivered by a deep neural network (DNN). The algorithm provides a causal noise PSD estimate and can thus be used in speech enhancement systems for communication purposes. An extensive performance comparison has been carried out with ten causal state-of-the-art noise tracking algorithms taken from the literature and categorized according to the applied techniques. The experiments showed that the proposed DNN-based noise PSD tracker outperforms all competing methods with respect to all tested performance measures, which include the noise tracking performance and the performance of a speech enhancement system employing the noise tracking component.}},
  author       = {{Chinaev, Aleksej and Heymann, Jahn and Drude, Lukas and Haeb-Umbach, Reinhold}},
  booktitle    = {{12. ITG Fachtagung Sprachkommunikation (ITG 2016)}},
  title        = {{{Noise-Presence-Probability-Based Noise PSD Estimation by Using DNNs}}},
  year         = {{2016}},
}
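
A minimal version of such an SPP-driven causal tracker, with the DNN's noise presence probability gating a recursive update, is sketched below; the exact recursion of the paper may differ, this follows the common MMSE-style pattern:

    import numpy as np

    def track_noise_psd(Y_pow, p_noise, alpha=0.8):
        """Causal noise PSD tracking driven by a noise presence probability.

        Y_pow:   periodogram |Y(t, f)|^2, shape (T, F)
        p_noise: DNN noise presence probability in [0, 1], shape (T, F)
        """
        lam = Y_pow[0].copy()                  # initialize from the first frame
        out = np.empty_like(Y_pow)
        for t in range(Y_pow.shape[0]):
            # MMSE-style estimate of the instantaneous noise power
            inst = p_noise[t] * Y_pow[t] + (1.0 - p_noise[t]) * lam
            lam = alpha * lam + (1.0 - alpha) * inst
            out[t] = lam
        return out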

@inproceedings{11812,
  author       = {{Heymann, Jahn and Drude, Lukas and Haeb-Umbach, Reinhold}},
  booktitle    = {{Proc. IEEE Intl. Conf. on Acoustics, Speech and Signal Processing (ICASSP)}},
  title        = {{{Neural Network Based Spectral Mask Estimation for Acoustic Beamforming}}},
  year         = {{2016}},
}

@inproceedings{11834,
  abstract     = {{We present a system for the 4th CHiME challenge which significantly increases the performance for all three tracks with respect to the provided baseline system. The front-end uses a bi-directional Long Short-Term Memory (BLSTM)-based neural network to estimate signal statistics. These then steer a Generalized Eigenvalue beamformer. The back-end consists of a 22-layer-deep Wide Residual Network and two extra BLSTM layers. Working on a whole utterance instead of on frames allows us to refine Batch-Normalization. We also train our own BLSTM-based language model. Adding a discriminative speaker adaptation leads to further gains. The final system achieves a word error rate on the six-channel real test data of 3.48%. For the two-channel track we achieve 5.96% and for the one-channel track 9.34%. This is the best reported performance on the challenge achieved by a single system, i.e., a configuration which does not combine multiple systems. At the same time, our system is independent of the microphone configuration. We can thus use the same components for all three tracks.}},
  author       = {{Heymann, Jahn and Drude, Lukas and Haeb-Umbach, Reinhold}},
  booktitle    = {{Proc. CHiME 2016 Workshop on Speech Processing in Everyday Environments}},
  title        = {{{Wide Residual BLSTM Network with Discriminative Speaker Adaptation for Robust Speech Recognition}}},
  year         = {{2016}},
}
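
The criterion steering the front-end, with the spatial covariance matrices estimated from the BLSTM masks, is the standard GEV (max-SNR) objective; in LaTeX:

    \mathbf{w}_{\mathrm{GEV}}(f)
      = \operatorname*{arg\,max}_{\mathbf{w}}
        \frac{\mathbf{w}^{\mathsf{H}}\,\boldsymbol{\Phi}_{XX}(f)\,\mathbf{w}}
             {\mathbf{w}^{\mathsf{H}}\,\boldsymbol{\Phi}_{NN}(f)\,\mathbf{w}}

i.e. the principal generalized eigenvector of the pair (Phi_XX(f), Phi_NN(f)).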

@inproceedings{11908,
  abstract     = {{This paper describes automatic speech recognition (ASR) systems developed jointly by RWTH, UPB and FORTH for the 1ch, 2ch and 6ch track of the 4th CHiME Challenge. In the 2ch and 6ch tracks the final system output is obtained by a Confusion Network Combination (CNC) of multiple systems. The Acoustic Model (AM) is a deep neural network based on Bidirectional Long Short-Term Memory (BLSTM) units. The systems differ by front ends and training sets used for the acoustic training. The model for the 1ch track is trained without any preprocessing. For each front end we trained and evaluated individual acoustic models. We compare the ASR performance of different beamforming approaches: a conventional superdirective beamformer [1] and an MVDR beamformer as in [2], where the steering vector is estimated based on [3]. Furthermore we evaluated a BLSTM supported Generalized Eigenvalue beamformer using NN-GEV [4]. The back end is implemented using RWTH's open-source toolkits RASR [5], RETURNN [6] and rwthlm [7]. We rescore lattices with a Long Short-Term Memory (LSTM) based language model. The overall best results are obtained by a system combination that includes the lattices from the system of UPB's submission [8]. Our final submission scored second in each of the three tracks of the 4th CHiME Challenge.}},
  author       = {{Menne, Tobias and Heymann, Jahn and Alexandridis, Anastasios and Irie, Kazuki and Zeyer, Albert and Kitza, Markus and Golik, Pavel and Kulikov, Ilia and Drude, Lukas and Schlüter, Ralf and Ney, Hermann and Haeb-Umbach, Reinhold and Mouchtaris, Athanasios}},
  booktitle    = {{Proc. CHiME 2016 Workshop on Speech Processing in Everyday Environments}},
  title        = {{{The RWTH/UPB/FORTH System Combination for the 4th CHiME Challenge Evaluation}}},
  year         = {{2016}},
}

@inproceedings{11810,
  author       = {{Heymann, Jahn and Drude, Lukas and Chinaev, Aleksej and Haeb-Umbach, Reinhold}},
  booktitle    = {{Automatic Speech Recognition and Understanding Workshop (ASRU 2015)}},
  title        = {{{BLSTM supported GEV Beamformer Front-End for the 3rd CHiME Challenge}}},
  year         = {{2015}},
}

