@inproceedings{11912,
  abstract     = {{In this contribution we provide a unified treatment of blind source separation (BSS) and noise suppression, two tasks which have traditionally been considered different and for which quite different techniques have been developed. Exploiting the sparseness of the sources in the short time frequency domain and using a probabilistic model which accounts for the presence of additive noise and which captures the spatial information of the multi-channel recording, a speech enhancement system is developed which suppresses noise and simultaneously separates speakers in case multiple speakers are active. Source activity estimation and model parameter estimation form the E-step and the M-step of the Expectation Maximization algorithm, respectively. Experimental results obtained on the dataset of the Signal Separation Evaluation Campaign 2010 demonstrate the effectiveness of the proposed system.}},
  author       = {{Tran Vu, Dang Hai and Haeb-Umbach, Reinhold}},
  booktitle    = {{International Workshop on Acoustic Echo and Noise Control (IWAENC 2010)}},
  title        = {{{An EM Approach to Integrated Multichannel Speech Separation and Noise Suppression}}},
  year         = {{2010}},
}

@inproceedings{11913,
  abstract     = {{In this paper we propose to employ directional statistics in a complex vector space to approach the problem of blind speech separation in the presence of spatially correlated noise. We interpret the values of the short time Fourier transform of the microphone signals to be draws from a mixture of complex Watson distributions, a probabilistic model which naturally accounts for spatial aliasing. The parameters of the density are related to the a priori source probabilities, the power of the sources and the transfer function ratios from sources to sensors. Estimation formulas are derived for these parameters by employing the Expectation Maximization (EM) algorithm. The E-step corresponds to the estimation of the source presence probabilities for each time-frequency bin, while the M-step leads to a maximum signal-to-noise ratio (MaxSNR) beamformer in the presence of uncertainty about the source activity. Experimental results are reported for an implementation in a generalized sidelobe canceller (GSC) like spatial beamforming configuration for 3 speech sources with significant coherent noise in reverberant environments, demonstrating the usefulness of the novel modeling framework.}},
  author       = {{Tran Vu, Dang Hai and Haeb-Umbach, Reinhold}},
  booktitle    = {{IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2010)}},
  keywords     = {{array signal processing, blind source separation, blind speech separation, complex vector space, complex Watson distribution, directional statistics, expectation-maximisation algorithm, expectation maximization algorithm, Fourier transform, Fourier transforms, generalized sidelobe canceller, interference suppression, maximum signal-to-noise ratio beamformer, microphone signal, probabilistic model, spatial aliasing, spatial beamforming configuration, speech enhancement, statistical distributions}},
  pages        = {{241--244}},
  title        = {{{Blind speech separation employing directional statistics in an Expectation Maximization framework}}},
  doi          = {{10.1109/ICASSP.2010.5495994}},
  year         = {{2010}},
}

@article{11892,
  abstract     = {{For an environment to be perceived as being smart, contextual information has to be gathered to adapt the system's behavior and its interface towards the user. Being a rich source of context information speech can be acquired unobtrusively by microphone arrays and then processed to extract information about the user and his environment. In this paper, a system for joint temporal segmentation, speaker localization, and identification is presented, which is supported by face identification from video data obtained from a steerable camera. Special attention is paid to latency aspects and online processing capabilities, as they are important for the application under investigation, namely ambient communication. It describes the vision of terminal-less, session-less and multi-modal telecommunication with remote partners, where the user can move freely within his home while the communication follows him. The speaker diarization serves as a context source, which has been integrated in a service-oriented middleware architecture and provided to the application to select the most appropriate I/O device and to steer the camera towards the speaker during ambient communication.}},
  author       = {{Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold}},
  journal      = {{IEEE Journal of Selected Topics in Signal Processing}},
  keywords     = {{audio streaming, audio visual data streaming, context information speech, face identification, face recognition, image segmentation, middleware, multimodal telecommunication, online diarization, service oriented middleware architecture, sessionless telecommunication, software architecture, speaker identification, speaker localization, speaker recognition, steerable camera, telecommunication computing, temporal segmentation, terminal-less telecommunication, video streaming}},
  number       = {{5}},
  pages        = {{845--856}},
  title        = {{{Online Diarization of Streaming Audio-Visual Data for Smart Environments}}},
  doi          = {{10.1109/JSTSP.2010.2050519}},
  volume       = {{4}},
  year         = {{2010}},
}

@inproceedings{11723,
  abstract     = {{In this paper we present a novel vehicle tracking algorithm, which is based on multi-level sensor fusion of GPS (global positioning system) with Inertial Measurement Unit sensor data. It is shown that the robustness of the system to temporary dropouts of the GPS signal, which may occur due to limited visibility of satellites in narrow street canyons or tunnels, is greatly improved by sensor fusion. We further demonstrate how the observation and state noise covariances of the employed Kalman filters can be estimated alongside the filtering by an application of the Expectation-Maximization algorithm. The proposed time-variant multi-level Kalman filter is shown to outperform an Interacting Multiple Model approach while at the same time being computationally less demanding.}},
  author       = {{Bevermeier, Maik and Peschke, Sven and Haeb-Umbach, Reinhold}},
  booktitle    = {{6th Workshop on Positioning Navigation and Communication (WPNC 2009)}},
  keywords     = {{covariance matrices, expectation-maximisation algorithm, expectation-maximization algorithm, global positioning system, Global Positioning System, GPS, inertial measurement unit, interacting multiple model approach, Kalman filters, multilevel sensor fusion, narrow street canyons, narrow tunnels, online parameter estimation, parameter estimation, road vehicles, robust vehicle localization, sensor fusion, state noise covariances, time-variant multilevel Kalman filter, vehicle tracking algorithm}},
  pages        = {{235--242}},
  title        = {{{Robust vehicle localization based on multi-level sensor fusion and online parameter estimation}}},
  doi          = {{10.1109/WPNC.2009.4907833}},
  year         = {{2009}},
}

@inproceedings{11724,
  abstract     = {{In this paper we present a novel vehicle tracking method which is based on multi-stage Kalman filtering of GPS and IMU sensor data. After individual Kalman filtering of GPS and IMU measurements the estimates of the orientation of the vehicle are combined in an optimal manner to improve the robustness towards drift errors. The tracking algorithm incorporates the estimation of time-variant covariance parameters by using an iterative block Expectation-Maximization algorithm to account for time-variant driving conditions and measurement quality. The proposed system is compared to an interacting multiple model approach (IMM) and achieves improved localization accuracy at lower computational complexity. Furthermore we show how the joint parameter estimation and localization can be conducted with streaming input data to be able to track vehicles in a real driving environment.}},
  author       = {{Bevermeier, Maik and Peschke, Sven and Haeb-Umbach, Reinhold}},
  booktitle    = {{IEEE 69th Vehicular Technology Conference (VTC 2009 Spring)}},
  keywords     = {{computational complexity, expectation-maximisation algorithm, Global Positioning System, inertial measurement unit, inertial navigation, interacting multiple model, iterative block expectation-maximization algorithm, Kalman filters, multi-stage Kalman filter, parameter estimation, road vehicles, vehicle positioning, vehicle tracking}},
  pages        = {{1--5}},
  title        = {{{Joint Parameter Estimation and Tracking in a Multi-Stage Kalman Filter for Vehicle Positioning}}},
  doi          = {{10.1109/VETECS.2009.5073634}},
  year         = {{2009}},
}

@inproceedings{11725,
  author       = {{Bevermeier, Maik and Peschke, Sven and Haeb-Umbach, Reinhold}},
  booktitle    = {{DGON Navigationskonvent 2009}},
  title        = {{{Eine Plattform fuer Mehrwertdienste im Bereich Logistik - Drahtlose Fahrzeug- und Laderaumueberwachung fuer LKW mit Hilfe einer Maut-On-Board Unit}}},
  year         = {{2009}},
}

@inproceedings{11847,
  abstract     = {{In this paper we present a new feature space dereverberation technique for automatic speech recognition. We derive an expression for the dependence of the reverberant speech features in the log-mel spectral domain on the non-reverberant speech features and the room impulse response. The obtained observation model is used for a model based speech enhancement based on Kalman filtering. The performance of the proposed enhancement technique is studied on the AURORA5 database. In our currently best configuration, which includes uncertainty decoding, the number of recognition errors is approximately halved compared to the recognition of unprocessed speech.}},
  author       = {{Krueger, Alexander and Haeb-Umbach, Reinhold}},
  booktitle    = {{Interspeech 2009}},
  title        = {{{Model based feature enhancement for automatic speech recognition in reverberant environments}}},
  year         = {{2009}},
}

@inproceedings{11859,
  abstract     = {{In this paper we present an Uncertainty Decoding rule which exploits feature reliability information and interframe correlation for noise robust speech recognition. The reliability information can be obtained either from conditional Bayesian estimation, where speech and noise feature vectors are tracked jointly, or by augmenting conventional point estimation methods with heuristics about the estimator's reliability. Experimental results on the AURORA2 database demonstrate on the one hand that Uncertainty Decoding improves recognition performance, while on the other hand it is seen that the severe approximations needed to arrive at computationally tractable solutions have their noticeable impact on recognition performance.}},
  author       = {{Leutnant, Volker and Haeb-Umbach, Reinhold}},
  booktitle    = {{International Conference on Acoustics (NAG/DAGA 2009)}},
  title        = {{{On the Estimation and Use of Feature Reliability Information for Noise Robust Speech Recognition}}},
  year         = {{2009}},
}

@inproceedings{11860,
  abstract     = {{In this paper we present an analytic derivation of the moments of the phase factor between clean speech and noise cepstral or log-mel-spectral feature vectors. The development shows, among others, that the probability density of the phase factor is of sub-Gaussian nature and that it is independent of the noise type and the signal-to-noise ratio, however dependent on the mel filter bank index. Further we show how to compute the contribution of the phase factor to both the mean and the vari- ance of the noisy speech observation likelihood, which relates the speech and noise feature vectors to those of noisy speech. The resulting phase-sensitive observation model is then used in model-based speech feature enhancement, leading to significant improvements in word accuracy on the AURORA2 database.}},
  author       = {{Leutnant, Volker and Haeb-Umbach, Reinhold}},
  booktitle    = {{Interspeech 2009}},
  title        = {{{An analytic derivation of a phase-sensitive observation model for noise robust speech recognition}}},
  year         = {{2009}},
}

@inproceedings{11881,
  abstract     = {{A combination of GPS (global positioning system) and INS (inertial navigation system) is known to provide high precision and highly robust vehicle localization. Notably during times when the GPS signal has a poor quality, e.g. due to the lack of a sufficiently large number of visible satellites, the INS, which may consist of a gyroscope and an odometer, will lead to improved positioning accuracy. In this paper we show how velocity information obtained from GSM (global system for mobile communications) signalling, rather than from a tachometer, can be used together with a gyroscope sensor to support localization in the presence of temporarily unavailable GPS data. We propose a sensor fusion system architecture and present simulation results that show the effectiveness of this approach.}},
  author       = {{Peschke, Sven and Bevermeier, Maik and Haeb-Umbach, Reinhold}},
  booktitle    = {{6th Workshop on Positioning Navigation and Communication (WPNC 2009)}},
  keywords     = {{cellular radio, distance measurement, global positioning system, Global Positioning System, global system for mobile communications, GPS positioning approach, GSM velocity, gyroscopes, gyroscope sensor, inertial navigation, inertial navigation system, odometer, sensor fusion system architecture, sensors}},
  pages        = {{195--202}},
  title        = {{{A GPS positioning approach exploiting GSM velocity estimates}}},
  doi          = {{10.1109/WPNC.2009.4907827}},
  year         = {{2009}},
}

@inproceedings{11882,
  author       = {{Peschke, Sven and Bevermeier, Maik and Haeb-Umbach, Reinhold}},
  booktitle    = {{DGON Navigationskonvent 2009}},
  title        = {{{Verbesserung von GPS-basierter Ortung durch GSM-Geschwindigkeitsschaetzungen}}},
  year         = {{2009}},
}

@article{11937,
  abstract     = {{In automatic speech recognition, hidden Markov models (HMMs) are commonly used for speech decoding, while switching linear dynamic models (SLDMs) can be employed for a preceding model-based speech feature enhancement. In this paper, these model types are combined in order to obtain a novel iterative speech feature enhancement and recognition architecture. It is shown that speech feature enhancement with SLDMs can be improved by feeding back information from the HMM to the enhancement stage. Two different feedback structures are derived. In the first, the posteriors of the HMM states are used to control the model probabilities of the SLDMs, while in the second they are employed to directly influence the estimate of the speech feature distribution. Both approaches lead to improvements in recognition accuracy both on the AURORA2 and AURORA4 databases compared to non-iterative speech feature enhancement with SLDMs. It is also shown that a combination with uncertainty decoding further enhances performance.}},
  author       = {{Windmann, Stefan and Haeb-Umbach, Reinhold}},
  journal      = {{IEEE Transactions on Audio, Speech, and Language Processing}},
  keywords     = {{AURORA2 databases, AURORA4 databases, automatic speech recognition, feedback structures, hidden Markov models, HMM, iterative methods, iterative speech feature enhancement, model probabilities, speech decoding, speech enhancement, speech feature distribution, speech recognition, switching linear dynamic models}},
  number       = {{5}},
  pages        = {{974--984}},
  title        = {{{Approaches to Iterative Speech Feature Enhancement and Recognition}}},
  doi          = {{10.1109/TASL.2009.2014894}},
  volume       = {{17}},
  year         = {{2009}},
}

@article{11938,
  abstract     = {{In this paper, parameter estimation of a state-space model of noise or noisy speech cepstra is investigated. A blockwise EM algorithm is derived for the estimation of the state and observation noise covariance from noise-only input data. It is supposed to be used during the offline training mode of a speech recognizer. Further a sequential online EM algorithm is developed to adapt the observation noise covariance on noisy speech cepstra at its input. The estimated parameters are then used in model-based speech feature enhancement for noise-robust automatic speech recognition. Experiments on the AURORA4 database lead to improved recognition results with a linear state model compared to the assumption of stationary noise.}},
  author       = {{Windmann, Stefan and Haeb-Umbach, Reinhold}},
  journal      = {{IEEE Transactions on Audio, Speech, and Language Processing}},
  keywords     = {{AURORA4 database, blockwise EM algorithm, covariance analysis, linear state model, noise covariance, noise-robust automatic speech recognition, noisy speech cepstra, offline training mode, parameter estimation, speech recognition, speech recognition equipment, speech recognizer, state-space methods, state-space model}},
  number       = {{8}},
  pages        = {{1577--1590}},
  title        = {{{Parameter Estimation of a State-Space Model of Noise for Robust Speech Recognition}}},
  doi          = {{10.1109/TASL.2009.2023172}},
  volume       = {{17}},
  year         = {{2009}},
}

@inproceedings{11900,
  author       = {{Schmalenstroeer, Joerg and Leutnant, Volker and Haeb-Umbach, Reinhold}},
  booktitle    = {{1st International Workshop on Distributed Computing in Ambient Environments within 32nd Annual Conference on Artificial Intelligence}},
  title        = {{{Audio-Visual Data Processing for Ambient Communication}}},
  year         = {{2009}},
}

@inproceedings{11806,
  abstract     = {{Microphone arrays represent the basis for many challenging acoustic sensing tasks. The accuracy of techniques like beamforming directly depends on a precise knowledge of the relative positions of the sensors used. Unfortunately, for certain use cases manually measuring the geometry of an array is not feasible due to practical constraints. In this paper we present an approach to unsupervised shape calibration of microphone array networks. We developed a hierarchical procedure that first performs local shape calibration based on coherence analysis and then employs SRP-PHAT in a network calibration method. Practical experiments demonstrate the effectiveness of our approach especially for highly reverberant acoustic environments.}},
  author       = {{Hennecke, Marius and Ploetz, Thomas and Fink, Gernot A. and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold}},
  booktitle    = {{IEEE/SP 15th Workshop on Statistical Signal Processing (SSP 2009)}},
  keywords     = {{acoustic sensing tasks, array geometry, calibration, coherence analysis, hierarchical procedure, local shape calibration, microphone array networks, microphone arrays, network calibration method, sensor arrays, SRP-PHAT, unsupervised shape calibration}},
  pages        = {{257--260}},
  title        = {{{A hierarchical approach to unsupervised shape calibration of microphone array networks}}},
  doi          = {{10.1109/SSP.2009.5278589}},
  year         = {{2009}},
}

@inproceedings{11899,
  abstract     = {{In this paper we present a system for identifying and localizing speakers using distant microphone arrays and a steerable pan-tilt-zoom camera. Audio and video streams are processed in real-time to obtain the diarization information ``who speaks when and where'' with low latency to be used in advanced video conferencing systems or user-adaptive interfaces. A key feature of the proposed system is to first glean information about the speaker{\rq}s location and identity from the audio and visual data streams separately and then to fuse these data in a probabilistic framework employing the Viterbi algorithm. Here, visual evidence of a person is utilized through a priori state probabilities, while location and speaker change information are employed via time-variant transition probabilities. Experiments show that video information yields a substantial improvement compared to pure audio-based diarization.}},
  author       = {{Schmalenstroeer, Joerg and Kelling, Martin and Leutnant, Volker and Haeb-Umbach, Reinhold}},
  booktitle    = {{Interspeech 2009}},
  title        = {{{Fusing Audio and Video Information for Online Speaker Diarization}}},
  year         = {{2009}},
}

@inproceedings{11776,
  abstract     = {{The term uncertainty decoding has been phrased for a class of robustness enhancing algorithms in automatic speech recognition that replace point estimates and plug-in rules by posterior densities and optimal decision rules. While uncertainty can be incorporated in the model domain, in the feature domain, or even in both, we concentrate here on feature domain approaches as they tend to be computationally less demanding. We derive optimal decision rules in the presence of uncertain observations and discuss simplifications which result in computationally efficient realizations. The usefulness of the presented statistical framework is then exemplified for two types of realworld problems: The first is improving the robustness of speech recognition towards incomplete or corrupted feature vectors due to a lossy communication link between the speech capturing front end and the backend recognition engine. And the second is the well-known and extensively studied issue of improving the robustness of the recognizer towards environmental noise.}},
  author       = {{Haeb-Umbach, Reinhold}},
  booktitle    = {{2008 ITG Conference on Voice Communication (SprachKommunikation)}},
  pages        = {{1--7}},
  title        = {{{Uncertainty Decoding in Automatic Speech Recognition}}},
  year         = {{2008}},
}

@inbook{11789,
  abstract     = {{In distributed and network speech recognition the actual recognition task is not carried out on the user{\rq}s terminal but rather on a remote server in the network. While there are good reasons for doing so, a disadvantage of this client-server architecture is clearly that the communication medium may introduce errors, which then impairs speech recognition accuracy. Even sophisticated channel coding cannot completely prevent the occurrence of residual bit errors in the case of temporarily adverse channel conditions, and in packet-oriented transmission packets of data may arrive too late for the given real-time constraints and have to be declared lost. The goal of error concealment is to reduce the detrimental effect that such errors may induce on the recipient of the transmitted speech signal by exploiting residual redundancy in the bit stream at the source coder output. In classical speech transmission a human is the recipient, and erroneous data are reconstructed so as to reduce the subjectively annoying effect of corrupted bits or lost packets. Here, however, a statistical classifier is at the receiving end, which can benefit from knowledge about the quality of the reconstruction. In this book chapter we show how the classical Bayesian decision rule needs to be modified to account for uncertain features, and illustrate how the required feature posterior density can be estimated in the case of distributed speech recognition. Some other techniques for error concealment can be related to this approach. Experimental results are given for both a small and a medium vocabulary recognition task and both for a channel exhibiting bit errors and a packet erasure channel.}},
  author       = {{Haeb-Umbach, Reinhold and Ion, Valentin}},
  booktitle    = {{Automatic Speech Recognition on Mobile Devices and over Communication Networks}},
  editor       = {{Lindberg, Boerge and Tan, Zheng-Hua}},
  pages        = {{187--210}},
  publisher    = {{Springer}},
  title        = {{{Error Concealment}}},
  series       = {{Advances in Computer Vision and Pattern Recognition}},
  year         = {{2008}},
}

@article{11820,
  abstract     = {{In this paper, we derive an uncertainty decoding rule for automatic speech recognition (ASR), which accounts for both corrupted observations and inter-frame correlation. The conditional independence assumption, prevalent in hidden Markov model-based ASR, is relaxed to obtain a clean speech posterior that is conditioned on the complete observed feature vector sequence. This is a more informative posterior than one conditioned only on the current observation. The novel decoding is used to obtain a transmission-error robust remote ASR system, where the speech capturing unit is connected to the decoder via an error-prone communication network. We show how the clean speech posterior can be computed for communication links being characterized by either bit errors or packet loss. Recognition results are presented for both distributed and network speech recognition, where in the latter case common voice-over-IP codecs are employed.}},
  author       = {{Ion, Valentin and Haeb-Umbach, Reinhold}},
  journal      = {{IEEE Transactions on Audio, Speech, and Language Processing}},
  keywords     = {{automatic speech recognition, bit errors, codecs, communication links, corrupted observations, decoding, distributed speech recognition, error-prone communication network, feature vector sequence, hidden Markov model-based ASR, hidden Markov models, inter-frame correlation, Internet telephony, network speech recognition, packet loss, speech posterior, speech recognition, transmission error robust speech recognition, uncertainty decoding, voice-over-IP codecs}},
  number       = {{5}},
  pages        = {{1047--1060}},
  title        = {{{A Novel Uncertainty Decoding Rule With Applications to Transmission Error Robust Speech Recognition}}},
  doi          = {{10.1109/TASL.2008.925879}},
  volume       = {{16}},
  year         = {{2008}},
}

@inproceedings{11821,
  abstract     = {{This paper addresses the robustness of automatic speech recognition to environmental noise. In order to account for reliability of the clean feature estimate we employ the feature posterior density conditioned on observed noisy features to perform uncertainty decoding. We investigate two approaches to estimate the posterior using a discrete feature space, first conditioning only on the current observation, and second on the whole feature sequence of an utterance. Experiments with Aurora 2 showed that the latter provides slightly better performance, as it allows for exploiting the temporal correlations between consecutive features.}},
  author       = {{Ion, Valentin and Haeb-Umbach, Reinhold}},
  booktitle    = {{2008 ITG Conference on Voice Communication (SprachKommunikation)}},
  pages        = {{1--4}},
  title        = {{{Investigations into Uncertainty Decoding Employing a Discrete Feature Space for Noise Robust Automatic Speech Recognition}}},
  year         = {{2008}},
}

