@article{12890,
  abstract     = {{We formulate a generic framework for blind source separation (BSS), which allows integrating data-driven spectro-temporal methods, such as deep clustering and deep attractor networks, with physically motivated probabilistic spatial methods, such as complex angular central Gaussian mixture models. The integrated model exploits the complementary strengths of the two approaches to BSS: the strong modeling power of neural networks, which, however, is based on supervised learning, and the ease of unsupervised learning of the spatial mixture models whose few parameters can be estimated on as little as a single segment of a real mixture of speech. Experiments are carried out on both artificially mixed speech and true recordings of speech mixtures. The experiments verify that the integrated models consistently outperform the individual components. We further extend the models to cope with noisy, reverberant speech and introduce a cross-domain teacher–student training where the mixture model serves as the teacher to provide training targets for the student neural network.}},
  author       = {{Drude, Lukas and Haeb-Umbach, Reinhold}},
  issn         = {{1941-0484}},
  journal      = {{IEEE Journal of Selected Topics in Signal Processing}},
  title        = {{{Integration of Neural Networks and Probabilistic Spatial Models for Acoustic Blind Source Separation}}},
  doi          = {{10.1109/JSTSP.2019.2912565}},
  year         = {{2019}},
}
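
A minimal sketch of the cross-domain teacher-student idea summarized in the abstract above, under assumptions: the spatial mixture model (teacher) yields per-time-frequency speaker posteriors on an unlabeled multi-channel mixture, and these posteriors act as soft targets for a neural mask estimator (student). The names estimate_cacgmm_posteriors and MaskEstimator are hypothetical placeholders, not APIs from the paper.

    # Hedged sketch of cross-domain teacher-student training: a spatial mixture
    # model (teacher) produces per-time-frequency speaker posteriors on real
    # mixtures, which become soft targets for a neural mask estimator (student).
    import torch
    import torch.nn as nn


    class MaskEstimator(nn.Module):
        """Toy single-channel mask estimator (student)."""

        def __init__(self, num_bins=513, num_speakers=2, hidden=600):
            super().__init__()
            self.blstm = nn.LSTM(num_bins, hidden, batch_first=True, bidirectional=True)
            self.out = nn.Linear(2 * hidden, num_bins * num_speakers)
            self.num_bins = num_bins
            self.num_speakers = num_speakers

        def forward(self, log_spectrogram):                 # (B, T, F)
            h, _ = self.blstm(log_spectrogram)
            logits = self.out(h)                            # (B, T, F * K)
            B, T, _ = logits.shape
            logits = logits.view(B, T, self.num_bins, self.num_speakers)
            return logits.softmax(dim=-1)                   # speaker posterior per T-F bin


    def teacher_student_step(student, optimizer, mixture_stft, log_features):
        """One unsupervised training step on a real multi-channel mixture."""
        with torch.no_grad():
            # Teacher: unsupervised EM on this segment; placeholder, returns (B, T, F, K).
            targets = estimate_cacgmm_posteriors(mixture_stft)
        masks = student(log_features)                       # (B, T, F, K)
        loss = -(targets * torch.log(masks + 1e-8)).sum(dim=-1).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        return loss.item()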

@inproceedings{15816,
  abstract     = {{Despite the strong modeling power of neural network acoustic models, speech enhancement has been shown to deliver additional word error rate improvements if multi-channel data is available. However, there has been a longstanding debate as to whether enhancement should also be carried out on the ASR training data. In an extensive experimental evaluation on the acoustically very challenging CHiME-5 dinner party data we show that: (i) cleaning up the training data can lead to substantial error rate reductions, and (ii) enhancement in training is advisable as long as enhancement in test is at least as strong as in training. This approach stands in contrast to, and delivers larger gains than, the common strategy reported in the literature of augmenting the training database with additional artificially degraded speech. Together with an acoustic model topology consisting of initial CNN layers followed by factorized TDNN layers we achieve, with 41.6% and 43.2% WER on the DEV and EVAL test sets, respectively, a new single-system state-of-the-art result on the CHiME-5 data. This is an 8% relative improvement compared to the best word error rate published so far for a speech recognizer without system combination.}},
  author       = {{Zorila, Catalin and Boeddeker, Christoph and Doddipatla, Rama and Haeb-Umbach, Reinhold}},
  booktitle    = {{ASRU 2019, Sentosa, Singapore}},
  title        = {{{An Investigation Into the Effectiveness of Enhancement in ASR Training and Test for Chime-5 Dinner Party Transcription}}},
  year         = {{2019}},
}

@inproceedings{14822,
  abstract     = {{Multi-talker speech and moving speakers still pose a significant challenge to automatic speech recognition systems. Assuming an enrollment utterance of the target speaker is available, the so-called SpeakerBeam concept has been recently proposed to extract the target speaker from a speech mixture. If multi-channel input is available, spatial properties of the speaker can be exploited to support the source extraction. In this contribution we investigate different approaches to exploit such spatial information. In particular, we are interested in the question how useful this information is if the target speaker changes his/her position. To this end, we present a SpeakerBeam-based source extraction network that is adapted to work on moving speakers by recursively updating the beamformer coefficients. Experimental results are presented on two data sets, one with artificially created room impulse responses, and one with real room impulse responses and noise recorded in a conference room. Interestingly, spatial features turn out to be advantageous even if the speaker position changes.}},
  author       = {{Heitkaemper, Jens and Feher, Thomas and Freitag, Michael and Haeb-Umbach, Reinhold}},
  booktitle    = {{International Conference on Statistical Language and Speech Processing 2019, Ljubljana, Slovenia}},
  title        = {{{A Study on Online Source Extraction in the Presence of Changing Speaker Positions}}},
  year         = {{2019}},
}
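
A hedged sketch of the recursive beamformer update mentioned in the abstract above: spatial covariance matrices are refreshed frame by frame with exponential forgetting so that MVDR coefficients can follow a moving target speaker. This is an illustrative re-implementation with assumed shapes and a generic MVDR formulation, not the SpeakerBeam code from the paper.

    # Hedged sketch: recursive covariance update with forgetting factor alpha,
    # followed by MVDR weights per frequency bin. Masks would come from a
    # SpeakerBeam-style network (placeholder).
    import numpy as np


    def update_covariance(R, y, mask, alpha=0.95):
        """Exponentially weighted update of a (D, D) covariance with frame y of shape (D,)."""
        return alpha * R + (1.0 - alpha) * mask * np.outer(y, y.conj())


    def mvdr_weights(R_target, R_noise):
        """MVDR beamformer with the principal eigenvector of R_target as steering vector."""
        _, eigvecs = np.linalg.eigh(R_target)
        d = eigvecs[:, -1]                               # dominant spatial direction
        numerator = np.linalg.solve(R_noise, d)          # R_noise^{-1} d
        return numerator / np.vdot(d, numerator)         # normalize by d^H R_noise^{-1} d


    # Per frequency bin f and frame t:
    #   R_x[f] = update_covariance(R_x[f], Y[t, f], speech_mask[t, f])
    #   R_n[f] = update_covariance(R_n[f], Y[t, f], 1 - speech_mask[t, f])
    #   w[f]   = mvdr_weights(R_x[f], R_n[f]);  X_hat[t, f] = np.vdot(w[f], Y[t, f])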

@inproceedings{14824,
  abstract     = {{This paper deals with multi-channel speech recognition in scenarios with multiple speakers. Recently, the spectral characteristics of a target speaker, extracted from an adaptation utterance, have been used to guide a neural network mask estimator to focus on that speaker. In this work we present two variants of speaker-aware neural networks, which exploit both spectral and spatial information to allow better discrimination between target and interfering speakers. Thus, we introduce either a spatial preprocessing prior to the mask estimation or a spatial plus spectral speaker characterization block whose output is directly fed into the neural mask estimator. The target speaker’s spectral and spatial signature is extracted from an adaptation utterance recorded at the beginning of a session. We further adapt the architecture for low-latency processing by means of block-online beamforming that recursively updates the signal statistics. Experimental results show that the additional spatial information clearly improves source extraction, in particular in the same-gender case, and that our proposal achieves state-of-the-art performance in terms of distortion reduction and recognition accuracy.}},
  author       = {{Martin-Donas, Juan M. and Heitkaemper, Jens and Haeb-Umbach, Reinhold and Gomez, Angel M. and Peinado, Antonio M.}},
  booktitle    = {{INTERSPEECH 2019, Graz, Austria}},
  title        = {{{Multi-Channel Block-Online Source Extraction based on Utterance Adaptation}}},
  year         = {{2019}},
}

@inproceedings{14826,
  abstract     = {{In this paper, we present Hitachi and Paderborn University’s joint effort for automatic speech recognition (ASR) in a dinner party scenario. The main challenges of ASR systems for dinner party recordings obtained by multiple microphone arrays are (1) heavy speech overlaps, (2) severe noise and reverberation, (3) very natural conversational content, and possibly (4) insufficient training data. As an example of a dinner party scenario, we have chosen the data presented during the CHiME-5 speech recognition challenge, where the baseline ASR had a 73.3% word error rate (WER), and even the best performing system at the CHiME-5 challenge had a 46.1% WER. We extensively investigated a combination of the guided source separation-based speech enhancement technique and an already proposed strong ASR backend and found that a tight combination of these techniques provided substantial accuracy improvements. Our final system achieved WERs of 39.94% and 41.64% for the development and evaluation data, respectively, both of which are the best published results for the dataset. We also conducted an investigation with additional training data beyond the official small training set of the CHiME-5 corpus to assess the intrinsic difficulty of this ASR task.}},
  author       = {{Kanda, Naoyuki and Boeddeker, Christoph and Heitkaemper, Jens and Fujita, Yusuke and Horiguchi, Shota and Haeb-Umbach, Reinhold}},
  booktitle    = {{INTERSPEECH 2019, Graz, Austria}},
  title        = {{{Guided Source Separation Meets a Strong ASR Backend: Hitachi/Paderborn University Joint Investigation for Dinner Party ASR}}},
  year         = {{2019}},
}

@inproceedings{13271,
  abstract     = {{Automatic meeting analysis comprises the tasks of speaker counting, speaker diarization, and the separation of overlapped speech, followed by automatic speech recognition. This all has to be carried out on arbitrarily long sessions and, ideally, in an online or block-online manner. While significant progress has been made on individual tasks, this paper presents for the first time an all-neural approach to simultaneous speaker counting, diarization and source separation. The NN-based estimator operates in a block-online fashion and tracks speakers even if they remain silent for a number of time blocks, thus learning a stable output order for the separated sources. The neural network is recurrent over time as well as over the number of sources. The simulation experiments show that state-of-the-art separation performance is achieved, while at the same time delivering good diarization and source counting results. It even generalizes well to an unseen large number of blocks.}},
  author       = {{von Neumann, Thilo and Kinoshita, Keisuke and Delcroix, Marc and Araki, Shoko and Nakatani, Tomohiro and Haeb-Umbach, Reinhold}},
  booktitle    = {{ICASSP 2019, Brighton, UK}},
  title        = {{{All-neural Online Source Separation, Counting, and Diarization for Meeting Analysis}}},
  year         = {{2019}},
}

@article{15814,
  abstract     = {{Once a popular theme of futuristic science fiction or far-fetched technology forecasts, digital home assistants with a spoken language interface have become a ubiquitous commodity today. This success has been made possible by major advancements in signal processing and machine learning for so-called far-field speech recognition, where the commands are spoken at a distance from the sound capturing device. The challenges encountered are quite unique and different from many other use cases of automatic speech recognition. The purpose of this tutorial article is to describe, in a way amenable to the non-specialist, the key speech processing algorithms that enable reliable fully hands-free speech interaction with digital home assistants. These technologies include multi-channel acoustic echo cancellation, microphone array processing and dereverberation techniques for signal enhancement, reliable wake-up word and end-of-interaction detection, high-quality speech synthesis, as well as sophisticated statistical models for speech and language, learned from large amounts of heterogeneous training data. In all these fields, deep learning has occupied a critical role.}},
  author       = {{Haeb-Umbach, Reinhold and Watanabe, Shinji and Nakatani, Tomohiro and Bacchiani, Michiel and Hoffmeister, Bjoern and Seltzer, Michael L. and Zen, Heiga and Souden, Mehrez}},
  issn         = {{1558-0792}},
  journal      = {{IEEE Signal Processing Magazine}},
  number       = {{6}},
  pages        = {{111--124}},
  title        = {{{Speech Processing for Digital Home Assistants: Combining Signal Processing With Deep-Learning Techniques}}},
  doi          = {{10.1109/MSP.2019.2918706}},
  volume       = {{36}},
  year         = {{2019}},
}

@article{19450,
  abstract     = {{When acoustic signal processing is combined with automated learning: communications engineers are working with multiple microphones and deep neural networks towards better speech recognition under the most adverse conditions. In the long term, digital voice assistants could also benefit from such sensor networks.}},
  author       = {{Haeb-Umbach, Reinhold}},
  journal      = {{DFG forschung 1/2019}},
  pages        = {{12--15}},
  title        = {{{Lektionen für Alexa & Co?!}}},
  doi          = {{10.1002/fors.201970104}},
  year         = {{2019}},
}

@inproceedings{15237,
  abstract     = {{This paper presents an approach to voice conversion which requires neither parallel data nor speaker or phone labels for training. It can convert between speakers which are not in the training set by employing the previously proposed concept of a factorized hierarchical variational autoencoder. Here, linguistic and speaker induced variations are separated upon the notion that content induced variations change at a much shorter time scale, i.e., at the segment level, than speaker induced variations, which vary at the longer utterance level. In this contribution we propose to employ convolutional instead of recurrent network layers in the encoder and decoder blocks, which is shown to achieve better phone recognition accuracy on the latent segment variables at frame level due to their better temporal resolution. For voice conversion the mean of the utterance variables is replaced with the respective estimated mean of the target speaker. The resulting log-mel spectra of the decoder output are used as local conditions of a WaveNet which is utilized for synthesis of the speech waveforms. Experiments show both good disentanglement properties of the latent space variables and good voice conversion performance.}},
  author       = {{Gburrek, Tobias and Glarner, Thomas and Ebbers, Janek and Haeb-Umbach, Reinhold and Wagner, Petra}},
  booktitle    = {{Proc. 10th ISCA Speech Synthesis Workshop}},
  location     = {{Vienna}},
  pages        = {{81--86}},
  title        = {{{Unsupervised Learning of a Disentangled Speech Representation for Voice Conversion}}},
  doi          = {{10.21437/SSW.2019-15}},
  year         = {{2019}},
}
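
A hedged sketch of the conversion step described in the abstract above: the segment-level (content) latents are kept, while the utterance-level (speaker) latent is substituted by the target speaker's estimated mean; an optional variant keeps the source utterance's deviation from its own speaker mean. Encoder, decoder and the WaveNet vocoder are placeholders, and the function name is hypothetical.

    # Hedged sketch of the utterance-latent manipulation used for conversion.
    import numpy as np


    def convert_utterance_latent(z_utt_source, mu_source, mu_target, keep_deviation=False):
        """Substitute the target speaker's utterance-latent mean.

        z_utt_source: (d_utt,) utterance-level latent of the source utterance
        mu_source:    (d_utt,) estimated mean over the source speaker's utterances
        mu_target:    (d_utt,) estimated mean over the target speaker's utterances
        """
        if keep_deviation:
            # keep the source utterance's deviation from its own speaker mean (assumption)
            return z_utt_source - mu_source + mu_target
        return np.asarray(mu_target)

    # Downstream (placeholders): log_mel = decoder(z_segment, z_converted);
    # waveform = wavenet(local_conditioning=log_mel)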

@inproceedings{15794,
  abstract     = {{In this paper we present our audio tagging system for the DCASE 2019 Challenge Task 2. We propose a model consisting of a convolutional front end using log-mel energies as input features, a recurrent neural network sequence encoder and a fully connected classifier network outputting an activity probability for each of the 80 considered event classes. Due to the recurrent neural network, which encodes a whole sequence into a single vector, our model is able to process sequences of varying lengths. The model is trained with only little manually labeled training data and a larger amount of automatically labeled web data, which hence suffers from label noise. To efficiently train the model with the provided data we use various data augmentation techniques to prevent overfitting and improve generalization. Our best submitted system achieves a label-weighted label-ranking average precision (lwlrap) of 75.5% on the private test set, which is an absolute improvement of 21.7% over the baseline. This system achieved second place in the team ranking of the DCASE 2019 Challenge Task 2 and fifth place in the Kaggle competition “Freesound Audio Tagging 2019” with more than 400 participants. After the challenge ended we further improved performance to 76.5% lwlrap, setting a new state-of-the-art on this dataset.}},
  author       = {{Ebbers, Janek and Haeb-Umbach, Reinhold}},
  booktitle    = {{DCASE2019 Workshop, New York, USA}},
  title        = {{{Convolutional Recurrent Neural Network and Data Augmentation for Audio Tagging with Noisy Labels and Minimal Supervision}}},
  year         = {{2019}},
}
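
A hedged sketch of the tagging model described in the abstract above: a convolutional front end over log-mel energies, a recurrent encoder that compresses the variable-length sequence into a single vector, and a fully connected classifier emitting an activity probability per event class. Layer sizes and the use of a GRU are illustrative assumptions, not the submitted system's configuration.

    # Hedged CRNN sketch: CNN front end, GRU sequence encoder, sigmoid classifier.
    import torch
    import torch.nn as nn


    class CRNNTagger(nn.Module):
        def __init__(self, num_mels=128, num_classes=80, hidden=256):
            super().__init__()
            self.cnn = nn.Sequential(
                nn.Conv2d(1, 32, kernel_size=3, padding=1), nn.ReLU(),
                nn.MaxPool2d((1, 2)),                        # pool along the mel axis only
                nn.Conv2d(32, 64, kernel_size=3, padding=1), nn.ReLU(),
                nn.MaxPool2d((1, 2)),
            )
            self.rnn = nn.GRU(64 * (num_mels // 4), hidden, batch_first=True)
            self.classifier = nn.Linear(hidden, num_classes)

        def forward(self, log_mel):                          # (B, T, num_mels)
            x = self.cnn(log_mel.unsqueeze(1))               # (B, C, T, num_mels // 4)
            B, C, T, F = x.shape
            x = x.permute(0, 2, 1, 3).reshape(B, T, C * F)   # time-major features
            _, h = self.rnn(x)                               # final state encodes the sequence
            return torch.sigmoid(self.classifier(h[-1]))     # (B, num_classes) activity probabilities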

@inproceedings{15796,
  abstract     = {{In this paper we consider human daily activity recognition using an acoustic sensor network (ASN) which consists of nodes distributed in a home environment. Assuming that the ASN is permanently recording, the vast majority of recordings is silence. Therefore, we propose to employ a computationally efficient two-stage sound recognition system, consisting of an initial sound activity detection (SAD) and a subsequent sound event classification (SEC), which is only activated once sound activity has been detected. We show how a low-latency activity detector with high temporal resolution can be trained from weak labels with low temporal resolution. We further demonstrate the advantage of using spatial features for the subsequent event classification task.}},
  author       = {{Ebbers, Janek and Drude, Lukas and Haeb-Umbach, Reinhold and Brendel, Andreas and Kellermann, Walter}},
  booktitle    = {{CAMSAP 2019, Guadeloupe, West Indies}},
  title        = {{{Weakly Supervised Sound Activity Detection and Event Classification in Acoustic Sensor Networks}}},
  year         = {{2019}},
}
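
A hedged sketch of the two-stage pipeline described in the abstract above: a lightweight sound activity detector runs continuously, and the more expensive event classifier is only invoked on blocks where activity was detected. The callables sad and sec stand for arbitrary detector and classifier models and are placeholders.

    # Hedged sketch of two-stage processing: cheap SAD gate, then SEC on demand.
    def process_block(block, sad, sec, threshold=0.5):
        """Return an event label for an audio block, or None if the block is silent."""
        if sad(block) < threshold:      # stage 1: sound activity detection
            return None
        return sec(block)               # stage 2: sound event classification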

@inproceedings{15792,
  abstract     = {{In this paper we highlight the privacy risks entailed in deep neural network feature extraction for domestic activity monitoring. We employ the baseline system proposed in Task 5 of the DCASE 2018 challenge and simulate a feature interception attack by an eavesdropper who wants to perform speaker identification. We then propose to reduce the aforementioned privacy risks by introducing a variational information feature extraction scheme that allows for good activity monitoring performance while at the same time minimizing the information content of the feature representation, thus restricting speaker identification attempts. We analyze the resulting model’s composite loss function and the budget scaling factor used to control the balance between the performance of the trusted and attacker tasks. It is empirically demonstrated that the proposed method reduces speaker identification privacy risks without significantly degrading the performance of the domestic activity monitoring task.}},
  author       = {{Nelus, Alexandru and Ebbers, Janek and Haeb-Umbach, Reinhold and Martin, Rainer}},
  booktitle    = {{INTERSPEECH 2019, Graz, Austria}},
  title        = {{{Privacy-preserving Variational Information Feature Extraction for Domestic Activity Monitoring Versus Speaker Identification}}},
  year         = {{2019}},
}
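
A hedged sketch of a composite loss in the spirit of the abstract above: the trusted task (activity monitoring) is trained as usual, while an information term on the extracted features is penalized with a budget scaling factor to limit what an eavesdropper could recover. The paper's exact formulation may differ; this is an information-bottleneck style stand-in with assumed shapes.

    # Hedged sketch: task loss plus budget-scaled KL of a stochastic feature
    # encoder q(z|x) = N(mean, var) against a standard normal prior.
    import torch
    import torch.nn.functional as F


    def composite_loss(activity_logits, activity_targets, feat_mean, feat_logvar, budget=1e-3):
        task_loss = F.cross_entropy(activity_logits, activity_targets)
        kl = -0.5 * torch.sum(
            1 + feat_logvar - feat_mean.pow(2) - feat_logvar.exp(), dim=-1
        ).mean()
        return task_loss + budget * kl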

@inproceedings{11760,
  abstract     = {{Acoustic event detection, i.e., the task of assigning a human interpretable label to a segment of audio, has only recently attracted increased interest in the research community. Driven by the DCASE challenges and the availability of large-scale audio datasets, the state of the art has progressed rapidly, with deep-learning-based classifiers dominating the field. Because several potential use cases favor a realization on distributed sensor nodes, e.g. ambient assisted living applications, habitat monitoring or surveillance, we are concerned with two issues here: firstly, the classification performance of such systems, and secondly, the computing resources required to achieve a certain performance considering node-level feature extraction. In this contribution we look at the balance between the two criteria by employing traditional techniques and different deep learning architectures, including convolutional and recurrent models, on real-life everyday audio recordings in realistic, yet challenging, multisource conditions.}},
  author       = {{Ebbers, Janek and Nelus, Alexandru and Martin, Rainer and Haeb-Umbach, Reinhold}},
  booktitle    = {{DAGA 2018, München}},
  title        = {{{Evaluation of Modulation-MFCC Features and DNN Classification for Acoustic Event Detection}}},
  year         = {{2018}},
}

@inproceedings{11835,
  abstract     = {{Signal dereverberation using the weighted prediction error (WPE) method has been proven to be an effective means to raise the accuracy of far-field speech recognition. But in its original formulation, WPE requires multiple iterations over a sufficiently long utterance, rendering it unsuitable for online low-latency applications. Recently, two methods have been proposed to overcome this limitation. One utilizes a neural network to estimate the power spectral density (PSD) of the target signal and works in a block-online fashion. The other method relies on a rather simple PSD estimation which smoothes the observed PSD and utilizes a recursive formulation which enables it to work on a frame-by-frame basis. In this paper, we integrate a deep neural network (DNN) based estimator into the recursive frame-online formulation. We evaluate the performance of the recursive system with different PSD estimators in comparison to the block-online and offline variants on two distinct corpora: the REVERB challenge data, where the signal is mainly degraded by reverberation, and a database which combines WSJ and VoiceHome to also include (directed) noise sources. The results show that although smoothing works surprisingly well, the more sophisticated DNN-based estimator shows promising improvements and shortens the performance gap between online and offline processing.}},
  author       = {{Heymann, Jahn and Drude, Lukas and Haeb-Umbach, Reinhold and Kinoshita, Keisuke and Nakatani, Tomohiro}},
  booktitle    = {{IWAENC 2018, Tokio, Japan}},
  title        = {{{Frame-Online DNN-WPE Dereverberation}}},
  year         = {{2018}},
}
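
The following LaTeX fragment is a hedged, single-channel sketch of the WPE model into which both PSD estimators discussed in the abstract above (recursive smoothing and the DNN) are plugged; the paper's multi-channel, frame-online recursion differs in detail, and the symbols (K filter taps, delay \Delta, target PSD \lambda) follow common WPE notation rather than the paper's exact definitions.

    % Hedged single-channel WPE sketch: late reverberation is predicted from
    % delayed past frames and subtracted; lambda_{t,f} is the target PSD,
    % supplied either by recursive smoothing or by the DNN estimator.
    \hat{x}_{t,f} = y_{t,f} - \mathbf{g}_f^{\mathsf{H}} \, \tilde{\mathbf{y}}_{t-\Delta,f},
    \qquad
    \tilde{\mathbf{y}}_{t-\Delta,f} = \left[ y_{t-\Delta,f}, \dots, y_{t-\Delta-K+1,f} \right]^{\mathsf{T}},
    \qquad
    \mathbf{g}_f = \operatorname*{arg\,min}_{\mathbf{g}} \sum_{t} \frac{\left| y_{t,f} - \mathbf{g}^{\mathsf{H}} \tilde{\mathbf{y}}_{t-\Delta,f} \right|^{2}}{\lambda_{t,f}}

The frame-online variant updates \mathbf{g}_f recursively per frame instead of iterating over the whole utterance.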

@inproceedings{11837,
  abstract     = {{We present a block-online multi-channel front end for automatic speech recognition in noisy and reverberated environments. It is an online version of our earlier proposed neural network supported acoustic beamformer, whose coefficients are calculated from noise and speech spatial covariance matrices which are estimated utilizing a neural mask estimator. However, the sparsity of speech in the STFT domain causes problems for the estimation of the initial beamformer coefficients in some frequency bins due to a lack of speech observations. We propose two methods to mitigate this issue. The first is to lower the frequency resolution of the STFT, which comes with the additional advantage of a reduced time window, thus lowering the latency introduced by block processing. The second approach is to smooth the beamforming coefficients along the frequency axis, thus exploiting their high inter-frequency correlation. With both approaches the gap between offline and block-online beamformer performance, as measured by the word error rate achieved by a downstream speech recognizer, is significantly reduced. Experiments are carried out on two corpora, representing noisy (CHiME-4) and noisy reverberant (voiceHome) environments.}},
  author       = {{Heitkaemper, Jens and Heymann, Jahn and Haeb-Umbach, Reinhold}},
  booktitle    = {{ITG 2018, Oldenburg, Germany}},
  title        = {{{Smoothing along Frequency in Online Neural Network Supported Acoustic Beamforming}}},
  year         = {{2018}},
}
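
A hedged sketch of the second mitigation described in the abstract above: the complex beamformer coefficients estimated per frequency bin are smoothed along the frequency axis with a short moving average, exploiting their inter-frequency correlation. The window length is an illustrative assumption.

    # Hedged sketch: moving-average smoothing of beamformer coefficients along frequency.
    import numpy as np


    def smooth_along_frequency(w, context=2):
        """Smooth complex beamformer coefficients w of shape (F, D) across frequency bins.

        Depending on the beamformer, a per-bin phase normalization may be needed
        before averaging to avoid cancellation (assumption, not from the paper).
        """
        num_bins, _ = w.shape
        w_smoothed = np.empty_like(w)
        for f in range(num_bins):
            lo, hi = max(0, f - context), min(num_bins, f + context + 1)
            w_smoothed[f] = w[lo:hi].mean(axis=0)
        return w_smoothed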

@inproceedings{11872,
  abstract     = {{The weighted prediction error (WPE) algorithm has proven to be a very successful dereverberation method for the REVERB challenge. Likewise, neural network based mask estimation for beamforming demonstrated very good noise suppression in the CHiME 3 and CHiME 4 challenges. Recently, it has been shown that this estimator can also be trained to perform dereverberation and denoising jointly. However, up to now a comparison of a neural beamformer and WPE is still missing, as is an investigation into a combination of the two. Therefore, we here provide an extensive evaluation of both and consequently propose variants to integrate deep neural network based beamforming with WPE. For these integrated variants we identify a consistent word error rate (WER) reduction on two distinct databases. In particular, our study shows that deep learning based beamforming benefits from a model-based dereverberation technique (i.e. WPE) and vice versa. Our key findings are: (a) the more channels and the more noise are present, the larger the WER advantage of neural beamforming over WPE; and (b) the integration of WPE and a neural beamformer consistently outperforms all stand-alone systems.}},
  author       = {{Drude, Lukas and Boeddeker, Christoph and Heymann, Jahn and Kinoshita, Keisuke and Delcroix, Marc and Nakatani, Tomohiro and Haeb-Umbach, Reinhold}},
  booktitle    = {{INTERSPEECH 2018, Hyderabad, India}},
  title        = {{{Integrating Neural Network Based Beamforming and Weighted Prediction Error Dereverberation}}},
  year         = {{2018}},
}

@inproceedings{11873,
  abstract     = {{NARA-WPE is a Python software package providing implementations of the weighted prediction error (WPE) dereverberation algorithm. WPE has been shown to be a highly effective tool for speech dereverberation, improving both the perceptual quality of the signal and the recognition performance of downstream automatic speech recognition (ASR). It is suitable both for single-channel and multi-channel applications. The package consists of (1) a Numpy implementation which can easily be integrated into a custom Python toolchain, and (2) a TensorFlow implementation which allows integration into larger computational graphs and enables backpropagation through WPE to train more advanced front-ends. The package comprises an iterative offline (batch) version, a block-online version, and a frame-online version which can be used in moderately low latency applications, e.g. digital speech assistants.}},
  author       = {{Drude, Lukas and Heymann, Jahn and Boeddeker, Christoph and Haeb-Umbach, Reinhold}},
  booktitle    = {{ITG 2018, Oldenburg, Germany}},
  title        = {{{NARA-WPE: A Python package for weighted prediction error dereverberation in Numpy and Tensorflow for online and offline processing}}},
  year         = {{2018}},
}
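
A hedged usage sketch of the package described in the abstract above. The functions wpe, stft and istft exist in nara_wpe, but the exact argument defaults and axis conventions are stated here from memory of the package's README and should be checked against it.

    # Hedged offline (batch) WPE usage sketch with nara_wpe.
    import numpy as np
    from nara_wpe.wpe import wpe
    from nara_wpe.utils import stft, istft

    y = np.random.randn(8, 16000)                 # stand-in signal: (channels, samples)
    Y = stft(y, size=512, shift=128)              # assumed output: (channels, frames, bins)
    Y = Y.transpose(2, 0, 1)                      # wpe is assumed to expect (bins, channels, frames)
    Z = wpe(Y, taps=10, delay=3, iterations=5)    # dereverberated STFT
    z = istft(Z.transpose(1, 2, 0), size=512, shift=128)   # back to the time domain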

@article{11916,
  abstract     = {{We present an experimental comparison of seven state-of-the-art machine learning algorithms for the task of semantic analysis of spoken input, with a special emphasis on applications for dysarthric speech. Dysarthria is a motor speech disorder, which is characterized by poor articulation of phonemes. In order to cater for these noncanonical phoneme realizations, we employed an unsupervised learning approach to estimate the acoustic models for speech recognition, which does not require a literal transcription of the training data. Even for the subsequent task of semantic analysis, only weak supervision is employed, whereby the training utterance is accompanied by a semantic label only, rather than a literal transcription. Results on two databases, one of them containing dysarthric speech, are presented showing that Markov logic networks and conditional random fields substantially outperform other machine learning approaches. Markov logic networks have proved to be especially robust to recognition errors, which are caused by imprecise articulation in dysarthric speech.}},
  author       = {{Despotovic, Vladimir and Walter, Oliver and Haeb-Umbach, Reinhold}},
  journal      = {{Speech Communication}},
  pages        = {{242--251}},
  volume       = {{99}},
  title        = {{{Machine learning techniques for semantic analysis of dysarthric speech: An experimental study}}},
  year         = {{2018}},
}

@inproceedings{12898,
  abstract     = {{Deep clustering (DC) and deep attractor networks (DANs) are data-driven approaches to monaural blind source separation. Both approaches provide astonishing single-channel performance but have not yet been generalized to block-online processing. When separating speech in a continuous stream with a block-online algorithm, it needs to be determined in each block which of the output streams belongs to whom. In this contribution we solve this block permutation problem by introducing an additional speaker identification embedding to the DAN model structure. We motivate this model decision by analyzing the embedding topology of DC and DANs and show that DC and DANs themselves are not sufficient for speaker identification. This model structure (a) improves the signal to distortion ratio (SDR) over a DAN baseline and (b) provides up to 61% and up to 34% relative reduction in permutation error rate and re-identification error rate compared to an i-vector baseline, respectively.}},
  author       = {{Drude, Lukas and von Neumann, Thilo and Haeb-Umbach, Reinhold}},
  booktitle    = {{ICASSP 2018, Calgary, Canada}},
  title        = {{{Deep Attractor Networks for Speaker Re-Identification and Blind Source Separation}}},
  year         = {{2018}},
}
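
A hedged sketch of how a speaker-identification embedding can resolve the block permutation problem described in the abstract above: each separated stream in the current block is assigned to the tracked speaker whose embedding it is closest to. This illustrates the matching step only, not the paper's DAN architecture.

    # Hedged sketch: align block outputs to tracked speakers via cosine similarity
    # and an optimal one-to-one assignment.
    import numpy as np
    from scipy.optimize import linear_sum_assignment


    def align_block(tracked_embeddings, block_embeddings):
        """Return the permutation mapping block output streams to tracked speakers.

        Both arguments have shape (K, E): K streams/speakers, E embedding dimensions.
        """
        a = tracked_embeddings / np.linalg.norm(tracked_embeddings, axis=1, keepdims=True)
        b = block_embeddings / np.linalg.norm(block_embeddings, axis=1, keepdims=True)
        cost = -a.dot(b.T)                          # negative cosine similarity
        row, col = linear_sum_assignment(cost)      # Hungarian assignment
        return col                                  # col[k]: block stream assigned to tracked speaker k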

@inproceedings{12900,
  abstract     = {{Deep attractor networks (DANs) are a recently introduced method to blindly separate sources from spectral features of a monaural recording using bidirectional long short-term memory networks (BLSTMs). Due to the nature of BLSTMs, this is inherently not online-ready and resorting to operating on blocks yields a block permutation problem in that the index of each speaker may change between blocks. We here propose the joint modeling of spatial and spectral features to solve the block permutation problem and generalize DANs to multi-channel meeting recordings: The DAN acts as a spectral feature extractor for a subsequent model-based clustering approach. We first analyze different joint models in batch-processing scenarios and finally propose a block-online blind source separation algorithm. The efficacy of the proposed models is demonstrated on reverberant mixtures corrupted by real recordings of multi-channel background noise. We demonstrate that both the proposed batch-processing and the proposed block-online system outperform (a) a spatial-only model with a state-of-the-art frequency permutation solver and (b) a spectral-only model with an oracle block permutation solver in terms of signal to distortion ratio (SDR) gains.}},
  author       = {{Drude, Lukas and Higuchi, Takuya and Kinoshita, Keisuke and Nakatani, Tomohiro and Haeb-Umbach, Reinhold}},
  booktitle    = {{ICASSP 2018, Calgary, Canada}},
  title        = {{{Dual Frequency- and Block-Permutation Alignment for Deep Learning Based Block-Online Blind Source Separation}}},
  year         = {{2018}},
}

