@inproceedings{20753,
  abstract     = {In this paper we present our system for the detection and classification of acoustic scenes and events (DCASE) 2020 Challenge Task 4: Sound event detection and separation in domestic environments. We introduce two new models: the forward-backward convolutional recurrent neural network (FBCRNN) and the tag-conditioned convolutional neural network (CNN). The FBCRNN employs two recurrent neural network (RNN) classifiers sharing the same CNN for preprocessing. With one RNN processing a recording in forward direction and the other in backward direction, the two networks are trained to jointly predict audio tags, i.e., weak labels, at each time step within a recording, given that at each time step they have jointly processed the whole recording. The proposed training encourages the classifiers to tag events as soon as possible. Therefore, after training, the networks can be applied to shorter audio segments of, e.g., 200ms, allowing sound event detection (SED). Further, we propose a tag-conditioned CNN to complement SED. It is trained to predict strong labels while using (predicted) tags, i.e., weak labels, as additional input. For training pseudo strong labels from a FBCRNN ensemble are used. The presented system scored the fourth and third place in the systems and teams rankings, respectively. Subsequent improvements allow our system to even outperform the challenge baseline and winner systems in average by, respectively, 18.0\% and 2.2\% event-based F1-score on the validation set. Source code is publicly available at https://github.com/fgnt/pb\_sed.},
  author       = {Ebbers, Janek and Haeb-Umbach, Reinhold},
  booktitle    = {Proceedings of the Detection and Classification of Acoustic Scenes and Events 2020 Workshop (DCASE2020)},
  title        = {Forward-Backward Convolutional Recurrent Neural Networks and Tag-Conditioned Convolutional Neural Networks for Weakly Labeled Semi-Supervised Sound Event Detection},
  year         = {2020},
}

@inproceedings{20695,
  author       = {Boeddeker, Christoph and Nakatani, Tomohiro and Kinoshita, Keisuke and Haeb-Umbach, Reinhold},
  booktitle    = {ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  isbn         = {9781509066315},
  title        = {Jointly Optimal Dereverberation and Beamforming},
  doi          = {10.1109/icassp40776.2020.9054393},
  year         = {2020},
}

@article{17762,
  abstract     = {Abstract Wenn akustische Signalverarbeitung mit automatisiertem Lernen verknüpft wird: Nachrichtentechniker arbeiten mit mehreren Mikrofonen und tiefen neuronalen Netzen an besserer Spracherkennung unter widrigsten Bedingungen. Von solchen Sensornetzwerken könnten langfristig auch digitale Sprachassistenten profitieren.},
  author       = {Haeb-Umbach, Reinhold},
  journal      = {forschung},
  number       = {1},
  pages        = {12--15},
  title        = {Lektionen für {Alexa} \& Co?!},
  doi          = {10.1002/fors.201970104},
  volume       = {44},
  year         = {2019},
}

@article{19446,
  abstract     = {We present a multi-channel database of overlapping speech for training, evaluation, and detailed analysis of source separation and extraction algorithms: SMS-WSJ -- Spatialized Multi-Speaker Wall Street Journal. It consists of artificially mixed speech taken from the WSJ database, but unlike earlier databases we consider all WSJ0+1 utterances and take care of strictly separating the speaker sets present in the training, validation and test sets. When spatializing the data we ensure a high degree of randomness w.r.t. room size, array center and rotation, as well as speaker position. Furthermore, this paper offers a critical assessment of recently proposed measures of source separation performance. Alongside the code to generate the database we provide a source separation baseline and a Kaldi recipe with competitive word error rates to provide common ground for evaluation.},
  author       = {Drude, Lukas and Heitkaemper, Jens and Boeddeker, Christoph and Haeb-Umbach, Reinhold},
  journal      = {ArXiv e-prints},
  eprint       = {1910.13934},
  archiveprefix = {arXiv},
  title        = {{SMS-WSJ}: Database, performance measures, and baseline recipe for multi-channel source separation and recognition},
  year         = {2019},
}

@inproceedings{11965,
  abstract     = {We present an unsupervised training approach for a neural network-based mask estimator in an acoustic beamforming application. The network is trained to maximize a likelihood criterion derived from a spatial mixture model of the observations. It is trained from scratch without requiring any parallel data consisting of degraded input and clean training targets. Thus, training can be carried out on real recordings of noisy speech rather than simulated ones. In contrast to previous work on unsupervised training of neural mask estimators, our approach avoids the need for a possibly pre-trained teacher model entirely. We demonstrate the effectiveness of our approach by speech recognition experiments on two different datasets: one mainly deteriorated by noise (CHiME 4) and one by reverberation (REVERB). The results show that the performance of the proposed system is on par with a supervised system using oracle target masks for training and with a system trained using a model-based teacher.},
  author       = {Drude, Lukas and Heymann, Jahn and Haeb-Umbach, Reinhold},
  booktitle    = {INTERSPEECH 2019, Graz, Austria},
  title        = {Unsupervised training of neural mask-based beamforming},
  year         = {2019},
}

@inproceedings{12874,
  abstract     = {We propose a training scheme to train neural network-based source separation algorithms from scratch when parallel clean data is unavailable. In particular, we demonstrate that an unsupervised spatial clustering algorithm is sufficient to guide the training of a deep clustering system. We argue that previous work on deep clustering requires strong supervision and elaborate on why this is a limitation. We demonstrate that (a) the single-channel deep clustering system trained according to the proposed scheme alone is able to achieve a similar performance as the multi-channel teacher in terms of word error rates and (b) initializing the spatial clustering approach with the deep clustering result yields a relative word error rate reduction of 26\% over the unsupervised teacher.},
  author       = {Drude, Lukas and Hasenklever, Daniel and Haeb-Umbach, Reinhold},
  booktitle    = {ICASSP 2019, Brighton, UK},
  title        = {Unsupervised Training of a Deep Clustering Model for Multichannel Blind Source Separation},
  year         = {2019},
}

@inproceedings{12875,
  abstract     = {Signal dereverberation using the Weighted Prediction Error (WPE) method has been proven to be an effective means to raise the accuracy of far-field speech recognition. First proposed as an iterative algorithm, follow-up works have reformulated it as a recursive least squares algorithm and therefore enabled its use in online applications. For this algorithm, the estimation of the power spectral density (PSD) of the anechoic signal plays an important role and strongly influences its performance. Recently, we showed that using a neural network PSD estimator leads to improved performance for online automatic speech recognition. This, however, comes at a price. To train the network, we require parallel data, i.e., utterances simultaneously available in clean and reverberated form. Here we propose to overcome this limitation by training the network jointly with the acoustic model of the speech recognizer. To be specific, the gradients computed from the cross-entropy loss between the target senone sequence and the acoustic model network output is backpropagated through the complex-valued dereverberation filter estimation to the neural network for PSD estimation. Evaluation on two databases demonstrates improved performance for on-line processing scenarios while imposing fewer requirements on the available training data and thus widening the range of applications.},
  author       = {Heymann, Jahn and Drude, Lukas and Haeb-Umbach, Reinhold and Kinoshita, Keisuke and Nakatani, Tomohiro},
  booktitle    = {ICASSP 2019, Brighton, UK},
  title        = {Joint Optimization of Neural Network-based {WPE} Dereverberation and Acoustic Model for Robust Online {ASR}},
  year         = {2019},
}

@article{12876,
  abstract     = {In this paper, we present libDirectional, a MATLAB library for directional statistics and directional estimation. It supports a variety of commonly used distributions on the unit circle, such as the von Mises, wrapped normal, and wrapped Cauchy distributions. Furthermore, various distributions on higher-dimensional manifolds such as the unit hypersphere and the hypertorus are available. Based on these distributions, several recursive filtering algorithms in libDirectional allow estimation on these manifolds. The functionality is implemented in a clear, well-documented, and object-oriented structure that is both easy to use and easy to extend.},
  author       = {Kurz, Gerhard and Gilitschenski, Igor and Pfaff, Florian and Drude, Lukas and Hanebeck, Uwe D. and Haeb-Umbach, Reinhold and Siegwart, Roland Y.},
  journal      = {Journal of Statistical Software},
  volume       = {89},
  number       = {4},
  title        = {Directional Statistics and Filtering Using {libDirectional}},
  doi          = {10.18637/jss.v089.i04},
  year         = {2019},
}

@article{12890,
  abstract     = {We formulate a generic framework for blind source separation (BSS), which allows integrating data-driven spectro-temporal methods, such as deep clustering and deep attractor networks, with physically motivated probabilistic spatial methods, such as complex angular central Gaussian mixture models. The integrated model exploits the complementary strengths of the two approaches to BSS: the strong modeling power of neural networks, which, however, is based on supervised learning, and the ease of unsupervised learning of the spatial mixture models whose few parameters can be estimated on as little as a single segment of a real mixture of speech. Experiments are carried out on both artificially mixed speech and true recordings of speech mixtures. The experiments verify that the integrated models consistently outperform the individual components. We further extend the models to cope with noisy, reverberant speech and introduce a cross-domain teacher–student training where the mixture model serves as the teacher to provide training targets for the student neural network.},
  author       = {Drude, Lukas and Haeb-Umbach, Reinhold},
  issn         = {1941-0484},
  journal      = {IEEE Journal of Selected Topics in Signal Processing},
  title        = {Integration of Neural Networks and Probabilistic Spatial Models for Acoustic Blind Source Separation},
  doi          = {10.1109/JSTSP.2019.2912565},
  year         = {2019},
}

@inproceedings{15812,
  abstract     = {Connectionist temporal classification (CTC) is a sequence-level loss that has been successfully applied to train recurrent neural network (RNN) models for automatic speech recognition. However, one major weakness of CTC is the conditional independence assumption that makes it difficult for the model to learn label dependencies. In this paper, we propose stimulated CTC, which uses stimulated learning to help CTC models learn label dependencies implicitly by using an auxiliary RNN to generate the appropriate stimuli. This stimuli comes in the form of an additional stimulation loss term which encourages the model to learn said label dependencies. The auxiliary network is only used during training and the inference model has the same structure as a standard CTC model. The proposed stimulated CTC model achieves about 35\% relative character error rate improvements on a synthetic gesture keyboard recognition task and over 30\% relative word error rate improvements on the Librispeech automatic speech recognition tasks over a baseline model trained with CTC only.},
  author       = {Heymann, Jahn and Sim, Khe Chai and Li, Bo},
  booktitle    = {ICASSP 2019, Brighton, UK},
  title        = {Improving {CTC} Using Stimulated Learning for Sequence Modeling},
  year         = {2019},
}

@inproceedings{15816,
  abstract     = {Despite the strong modeling power of neural network acoustic models, speech enhancement has been shown to deliver additional word error rate improvements if multi-channel data is available. However, there has been a longstanding debate whether enhancement should also be carried out on the ASR training data. In an extensive experimental evaluation on the acoustically very challenging CHiME-5 dinner party data we show that: (i) cleaning up the training data can lead to substantial error rate reductions, and (ii) enhancement in training is advisable as long as enhancement in test is at least as strong as in training. This approach stands in contrast and delivers larger gains than the common strategy reported in the literature to augment the training database with additional artificially degraded speech. Together with an acoustic model topology consisting of initial CNN layers followed by factorized TDNN layers we achieve with 41.6\% and 43.2\% WER on the DEV and EVAL test sets, respectively, a new single-system state-of-the-art result on the CHiME-5 data. This is a 8\% relative improvement compared to the best word error rate published so far for a speech recognizer without system combination.},
  author       = {Zorila, Catalin and Boeddeker, Christoph and Doddipatla, Rama and Haeb-Umbach, Reinhold},
  booktitle    = {ASRU 2019, Sentosa, Singapore},
  title        = {An Investigation Into the Effectiveness of Enhancement in {ASR} Training and Test for {CHiME-5} Dinner Party Transcription},
  year         = {2019},
}

@inproceedings{14822,
  abstract     = {Multi-talker speech and moving speakers still pose a significant challenge to automatic speech recognition systems. Assuming an enrollment utterance of the target speaker is available, the so-called SpeakerBeam concept has been recently proposed to extract the target speaker from a speech mixture. If multi-channel input is available, spatial properties of the speaker can be exploited to support the source extraction. In this contribution we investigate different approaches to exploit such spatial information. In particular, we are interested in the question, how useful this information is if the target speaker changes his/her position. To this end, we present a SpeakerBeam-based source extraction network that is adapted to work on moving speakers by recursively updating the beamformer coefficients. Experimental results are presented on two data sets, one with artificially created room impulse responses, and one with real room impulse responses and noise recorded in a conference room. Interestingly, spatial features turn out to be advantageous even if the speaker position changes.},
  author       = {Heitkaemper, Jens and Feher, Thomas and Freitag, Michael and Haeb-Umbach, Reinhold},
  booktitle    = {International Conference on Statistical Language and Speech Processing 2019, Ljubljana, Slovenia},
  title        = {A Study on Online Source Extraction in the Presence of Changing Speaker Positions},
  year         = {2019},
}

@inproceedings{14824,
  abstract     = {This paper deals with multi-channel speech recognition in scenarios with multiple speakers. Recently, the spectral characteristics of a target speaker, extracted from an adaptation utterance, have been used to guide a neural network mask estimator to focus on that speaker. In this work we present two variants of speaker-aware neural networks, which exploit both spectral and spatial information to allow better discrimination between target and interfering speakers. Thus, we introduce either a spatial preprocessing prior to the mask estimation or a spatial plus spectral speaker characterization block whose output is directly fed into the neural mask estimator. The target speaker’s spectral and spatial signature is extracted from an adaptation utterance recorded at the beginning of a session. We further adapt the architecture for low-latency processing by means of block-online beamforming that recursively updates the signal statistics. Experimental results show that the additional spatial information clearly improves source extraction, in particular in the same-gender case, and that our proposal achieves state-of-the-art performance in terms of distortion reduction and recognition accuracy.},
  author       = {Martin-Donas, Juan M. and Heitkaemper, Jens and Haeb-Umbach, Reinhold and Gomez, Angel M. and Peinado, Antonio M.},
  booktitle    = {INTERSPEECH 2019, Graz, Austria},
  title        = {Multi-Channel Block-Online Source Extraction based on Utterance Adaptation},
  year         = {2019},
}

@inproceedings{14826,
  abstract     = {In this paper, we present Hitachi and Paderborn University’s joint effort for automatic speech recognition (ASR) in a dinner party scenario. The main challenges of ASR systems for dinner party recordings obtained by multiple microphone arrays are (1) heavy speech overlaps, (2) severe noise and reverberation, (3) very natural conversational content, and possibly (4) insufficient training data. As an example of a dinner party scenario, we have chosen the data presented during the CHiME-5 speech recognition challenge, where the baseline ASR had a 73.3\% word error rate (WER), and even the best performing system at the CHiME-5 challenge had a 46.1\% WER. We extensively investigated a combination of the guided source separation-based speech enhancement technique and an already proposed strong ASR backend and found that a tight combination of these techniques provided substantial accuracy improvements. Our final system achieved WERs of 39.94\% and 41.64\% for the development and evaluation data, respectively, both of which are the best published results for the dataset. We also investigated with additional training data on the official small data in the CHiME-5 corpus to assess the intrinsic difficulty of this ASR task.},
  author       = {Kanda, Naoyuki and Boeddeker, Christoph and Heitkaemper, Jens and Fujita, Yusuke and Horiguchi, Shota and Haeb-Umbach, Reinhold},
  booktitle    = {INTERSPEECH 2019, Graz, Austria},
  title        = {Guided Source Separation Meets a Strong {ASR} Backend: {Hitachi/Paderborn University} Joint Investigation for Dinner Party {ASR}},
  year         = {2019},
}

@inproceedings{13271,
  abstract     = {Automatic meeting analysis comprises the tasks of speaker counting, speaker diarization, and the separation of overlapped speech, followed by automatic speech recognition. This all has to be carried out on arbitrarily long sessions and, ideally, in an online or block-online manner. While significant progress has been made on individual tasks, this paper presents for the first time an all-neural approach to simultaneous speaker counting, diarization and source separation. The NN-based estimator operates in a block-online fashion and tracks speakers even if they remain silent for a number of time blocks, thus learning a stable output order for the separated sources. The neural network is recurrent over time as well as over the number of sources. The simulation experiments show that state of the art separation performance is achieved, while at the same time delivering good diarization and source counting results. It even generalizes well to an unseen large number of blocks.},
  author       = {von Neumann, Thilo and Kinoshita, Keisuke and Delcroix, Marc and Araki, Shoko and Nakatani, Tomohiro and Haeb-Umbach, Reinhold},
  booktitle    = {ICASSP 2019, Brighton, UK},
  title        = {All-neural Online Source Separation, Counting, and Diarization for Meeting Analysis},
  year         = {2019},
}

@article{15814,
  abstract     = {Once a popular theme of futuristic science fiction or far-fetched technology forecasts, digital home assistants with a spoken language interface have become a ubiquitous commodity today. This success has been made possible by major advancements in signal processing and machine learning for so-called far-field speech recognition, where the commands are spoken at a distance from the sound capturing device. The challenges encountered are quite unique and different from many other use cases of automatic speech recognition. The purpose of this tutorial article is to describe, in a way amenable to the non-specialist, the key speech processing algorithms that enable reliable fully hands-free speech interaction with digital home assistants. These technologies include multi-channel acoustic echo cancellation, microphone array processing and dereverberation techniques for signal enhancement, reliable wake-up word and end-of-interaction detection, high-quality speech synthesis, as well as sophisticated statistical models for speech and language, learned from large amounts of heterogeneous training data. In all these fields, deep learning has occupied a critical role.},
  author       = {Haeb-Umbach, Reinhold and Watanabe, Shinji and Nakatani, Tomohiro and Bacchiani, Michiel and Hoffmeister, Bjoern and Seltzer, Michael L. and Zen, Heiga and Souden, Mehrez},
  issn         = {1558-0792},
  journal      = {IEEE Signal Processing Magazine},
  number       = {6},
  pages        = {111--124},
  title        = {Speech Processing for Digital Home Assistants: Combining Signal Processing With Deep-Learning Techniques},
  doi          = {10.1109/MSP.2019.2918706},
  volume       = {36},
  year         = {2019},
}

@article{19450,
  abstract     = {Wenn akustische Signalverarbeitung mit automatisiertem Lernen verknüpft wird: Nachrichtentechniker arbeiten mit mehreren Mikrofonen und tiefen neuronalen Netzen an besserer Spracherkennung unter widrigsten Bedingungen. Von solchen Sensornetzwerken könnten langfristig auch digitale Sprachassistenten profitieren.},
  author       = {Haeb-Umbach, Reinhold},
  journal      = {forschung},
  number       = {1},
  pages        = {12--15},
  title        = {Lektionen für {Alexa} \& Co?!},
  doi          = {10.1002/fors.201970104},
  year         = {2019},
  internal-note = {Appears to duplicate entry 17762 (same DOI, title, and pages); consider merging.},
}

@inproceedings{15237,
  abstract     = {This paper presents an approach to voice conversion, which does neither require parallel data nor speaker or phone labels for training. It can convert between speakers which are not in the training set by employing the previously proposed concept of a factorized hierarchical variational autoencoder. Here, linguistic and speaker induced variations are separated upon the notion that content induced variations change at a much shorter time scale, i.e., at the segment level, than speaker induced variations, which vary at the longer utterance level. In this contribution we propose to employ convolutional instead of recurrent network layers in the encoder and decoder blocks, which is shown to achieve better phone recognition accuracy on the latent segment variables at frame-level due to their better temporal resolution. For voice conversion the mean of the utterance variables is replaced with the respective estimated mean of the target speaker. The resulting log-mel spectra of the decoder output are used as local conditions of a WaveNet which is utilized for synthesis of the speech waveforms. Experiments show both good disentanglement properties of the latent space variables, and good voice conversion performance.},
  author       = {Gburrek, Tobias and Glarner, Thomas and Ebbers, Janek and Haeb-Umbach, Reinhold and Wagner, Petra},
  booktitle    = {Proc. 10th ISCA Speech Synthesis Workshop},
  location     = {Vienna},
  pages        = {81--86},
  title        = {Unsupervised Learning of a Disentangled Speech Representation for Voice Conversion},
  doi          = {10.21437/SSW.2019-15},
  year         = {2019},
}

@inproceedings{15794,
  abstract     = {In this paper we present our audio tagging system for the DCASE 2019 Challenge Task 2. We propose a model consisting of a convolutional front end using log-mel-energies as input features, a recurrent neural network sequence encoder and a fully connected classifier network outputting an activity probability for each of the 80 considered event classes. Due to the recurrent neural network, which encodes a whole sequence into a single vector, our model is able to process sequences of varying lengths. The model is trained with only little manually labeled training data and a larger amount of automatically labeled web data, which hence suffers from label noise. To efficiently train the model with the provided data we use various data augmentation to prevent overfitting and improve generalization. Our best submitted system achieves a label-weighted label-ranking average precision (lwlrap) of 75.5\% on the private test set which is an absolute improvement of 21.7\% over the baseline. This system scored the second place in the teams ranking of the DCASE 2019 Challenge Task 2 and the fifth place in the Kaggle competition “Freesound Audio Tagging 2019” with more than 400 participants. After the challenge ended we further improved performance to 76.5\% lwlrap setting a new state-of-the-art on this dataset.},
  author       = {Ebbers, Janek and Haeb-Umbach, Reinhold},
  booktitle    = {DCASE2019 Workshop, New York, USA},
  title        = {Convolutional Recurrent Neural Network and Data Augmentation for Audio Tagging with Noisy Labels and Minimal Supervision},
  year         = {2019},
}

@inproceedings{15796,
  abstract     = {In this paper we consider human daily activity recognition using an acoustic sensor network (ASN) which consists of nodes distributed in a home environment. Assuming that the ASN is permanently recording, the vast majority of recordings is silence. Therefore, we propose to employ a computationally efficient two-stage sound recognition system, consisting of an initial sound activity detection (SAD) and a subsequent sound event classification (SEC), which is only activated once sound activity has been detected. We show how a low-latency activity detector with high temporal resolution can be trained from weak labels with low temporal resolution. We further demonstrate the advantage of using spatial features for the subsequent event classification task.},
  author       = {Ebbers, Janek and Drude, Lukas and Haeb-Umbach, Reinhold and Brendel, Andreas and Kellermann, Walter},
  booktitle    = {CAMSAP 2019, Guadeloupe, West Indies},
  title        = {Weakly Supervised Sound Activity Detection and Event Classification in Acoustic Sensor Networks},
  year         = {2019},
}

