@inproceedings{11925,
  abstract     = {{In this paper we present a system for car navigation by fusing sensor data on an Android smartphone. The key idea is to use both the internal sensors of the smartphone (e.g., gyroscope) and sensor data from the car (e.g., speed information) to support navigation via GPS. To this end we employ a CAN-Bus-to-Bluetooth adapter to establish a wireless connection between the smartphone and the CAN-Bus of the car. On the smartphone a strapdown algorithm and an error-state Kalman filter are used to fuse the different sensor data streams. The experimental results show that the system is able to maintain higher positioning accuracy during GPS dropouts, thus improving the availability and reliability, compared to GPS-only solutions.}},
  author       = {{Walter, Oliver and Schmalenstroeer, Joerg and Engler, Andreas and Haeb-Umbach, Reinhold}},
  booktitle    = {{9th Workshop on Positioning Navigation and Communication (WPNC 2012)}},
  keywords     = {{Smartphone, navigation, sensor fusion}},
  title        = {{{Smartphone-Based Sensor Fusion for Improved Vehicular Navigation}}},
  year         = {{2012}},
}

@inproceedings{11721,
  author       = {{Bevermeier, Maik and Flanke, Stephan and Haeb-Umbach, Reinhold and Stehr, Jan}},
  booktitle    = {{International Workshop on Intelligent Transportation (WIT 2011)}},
  title        = {{{A Platform for Efficient Supply Chain Management Support in Logistics}}},
  year         = {{2011}},
}

@inbook{11774,
  abstract     = {{In this contribution classification rules for HMM-based speech recognition in the presence of a mismatch between training and test data are presented. The observed feature vectors are regarded as corrupted versions of underlying and unobservable clean feature vectors, which have the same statistics as the training data. Optimal classification then consists of two steps. First, the posterior density of the clean feature vector, given the observed feature vectors, has to be determined, and second, this posterior is employed in a modified classification rule, which accounts for imperfect estimates. We discuss different variants of the classification rule and further elaborate on the estimation of the clean speech feature posterior, using conditional Bayesian estimation. It is shown that this concept is fairly general and can be applied to different scenarios, such as noisy or reverberant speech recognition.}},
  author       = {{Haeb-Umbach, Reinhold}},
  booktitle    = {{Robust Speech Recognition of Uncertain or Missing Data}},
  editor       = {{Haeb-Umbach, Reinhold and Kolossa, Dorothea}},
  publisher    = {{Springer}},
  title        = {{{Uncertainty Decoding and Conditional Bayesian Estimation}}},
  year         = {{2011}},
}

@inbook{11775,
  author       = {{Haeb-Umbach, Reinhold}},
  booktitle    = {{Baustelle Informationsgesellschaft und Universität heute}},
  publisher    = {{Ferdinand Schoeningh Verlag, Paderborn}},
  title        = {{{Können Computer sprechen und hören, sollen sie es überhaupt können? Sprachverarbeitung und ambiente Intelligenz}}},
  year         = {{2011}},
}

@article{11807,
  author       = {{Herbig, Tobias and Gerl, Franz and Minker, Wolfgang and Haeb-Umbach, Reinhold}},
  journal      = {{Evolving Systems}},
  number       = {{3}},
  pages        = {{199--214}},
  title        = {{{Adaptive Systems for Unsupervised Speaker Tracking and Speech Recognition}}},
  volume       = {{2}},
  year         = {{2011}},
}

@inbook{11843,
  abstract     = {{Employing automatic speech recognition systems in hands-free communication applications is accompanied by performance degradation due to background noise and, in particular, due to reverberation. These two kinds of distortion alter the shape of the feature vector trajectory extracted from the microphone signal and consequently lead to a discrepancy between training and testing conditions for the recognizer. In this chapter we present a feature enhancement approach aiming at the joint compensation of noise and reverberation to improve the performance by restoring the training conditions. For the enhancement we concentrate on the logarithmic mel power spectral coefficients as features, which are computed at an intermediate stage to obtain the widely used mel frequency cepstral coefficients. The proposed technique is based on a Bayesian framework, to attempt to infer the posterior distribution of the clean features given the observation of all past corrupted features. It exploits information from a priori models describing the dynamics of clean speech and noise-only feature vector trajectories as well as from an observation model relating the reverberant noisy to the clean features. The observation model relies on a simplified stochastic model of the room impulse response (RIR) between the speaker and the microphone, having only two parameters, namely RIR energy and reverberation time, which can be estimated from the captured microphone signal. The performance of the proposed enhancement technique is finally experimentally studied by means of recognition accuracy obtained for a connected digits recognition task under different noise and reverberation conditions using the Aurora~5 database.}},
  author       = {{Krueger, Alexander and Haeb-Umbach, Reinhold}},
  booktitle    = {{Robust Speech Recognition of Uncertain or Missing Data}},
  editor       = {{Haeb-Umbach, Reinhold and Kolossa, Dorothea}},
  publisher    = {{Springer}},
  title        = {{{A Model-Based Approach to Joint Compensation of Noise and Reverberation for Speech Recognition}}},
  year         = {{2011}},
}

@inproceedings{11845,
  abstract     = {{The paper proposes a modification of the standard maximum a posteriori (MAP) method for the estimation of the parameters of a Gaussian process for cases where the process is superposed by additive Gaussian observation errors of known variance. Simulations on artificially generated data demonstrate the superiority of the proposed method. While reducing to the ordinary MAP approach in the absence of observation noise, the improvement becomes the more pronounced the larger the variance of the observation noise. The method is further extended to track the parameters in case of non-stationary Gaussian processes.}},
  author       = {{Krueger, Alexander and Haeb-Umbach, Reinhold}},
  booktitle    = {{IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2011)}},
  keywords     = {{Gaussian processes, MAP-based estimation, maximum a posteriori method, maximum likelihood estimation, nonstationary Gaussian processes}},
  pages        = {{3596--3599}},
  title        = {{{MAP-based estimation of the parameters of non-stationary Gaussian processes from noisy observations}}},
  doi          = {{10.1109/ICASSP.2011.5946256}},
  year         = {{2011}},
}

@article{11850,
  abstract     = {{In this paper, we present a novel blocking matrix and fixed beamformer design for a generalized sidelobe canceler for speech enhancement in a reverberant enclosure. They are based on a new method for estimating the acoustical transfer function ratios in the presence of stationary noise. The estimation method relies on solving a generalized eigenvalue problem in each frequency bin. An adaptive eigenvector tracking utilizing the power iteration method is employed and shown to achieve a high convergence speed. Simulation results demonstrate that the proposed beamformer leads to better noise and interference reduction and reduced speech distortions compared to other blocking matrix designs from the literature.}},
  author       = {{Krueger, Alexander and Warsitz, Ernst and Haeb-Umbach, Reinhold}},
  journal      = {{IEEE Transactions on Audio, Speech, and Language Processing}},
  keywords     = {{acoustical transfer function ratio, adaptive eigenvector tracking, array signal processing, beamformer design, blocking matrix, eigenvalues and eigenfunctions, eigenvector-based transfer function ratios estimation, generalized sidelobe canceler, interference reduction, iterative methods, power iteration method, reduced speech distortions, reverberant enclosure, reverberation, speech enhancement, stationary noise}},
  number       = {{1}},
  pages        = {{206--219}},
  title        = {{{Speech Enhancement With a GSC-Like Structure Employing Eigenvector-Based Transfer Function Ratios Estimation}}},
  doi          = {{10.1109/TASL.2010.2047324}},
  volume       = {{19}},
  year         = {{2011}},
}

@inbook{11856,
  abstract     = {{In this contribution, conditional Bayesian estimation employing a phase-sensitive observation model for noise robust speech recognition will be studied. After a review of speech recognition under the presence of corrupted features, termed uncertainty decoding, the estimation of the posterior distribution of the uncorrupted (clean) feature vector will be shown to be a key element of noise robust speech recognition. The estimation process will be based on three major components: an a priori model of the unobservable data, an observation model relating the unobservable data to the corrupted observation and an inference algorithm, finally allowing for a computationally tractable solution. Special stress will be laid on a detailed derivation of the phase-sensitive observation model and the required moments of the phase factor distribution. Thereby, it will not only be proven analytically that the phase factor distribution is non-Gaussian but also that all central moments can (approximately) be computed solely based on the used mel filter bank, finally rendering the moments independent of noise type and signal-to-noise ratio. The phase-sensitive observation model will then be incorporated into a model-based feature enhancement scheme and recognition experiments will be carried out on the Aurora~2 and Aurora~4 databases. The importance of incorporating phase factor information into the enhancement scheme is pointed out by all recognition results. Application of the proposed scheme under the derived uncertainty decoding framework further leads to significant improvements in both recognition tasks, eventually reaching the performance achieved with the ETSI advanced front-end.}},
  author       = {{Leutnant, Volker and Haeb-Umbach, Reinhold}},
  booktitle    = {{Robust Speech Recognition of Uncertain or Missing Data}},
  editor       = {{Haeb-Umbach, Reinhold and Kolossa, Dorothea}},
  publisher    = {{Springer}},
  title        = {{{Conditional Bayesian Estimation Employing a Phase-Sensitive Observation Model for Noise Robust Speech Recognition}}},
  year         = {{2011}},
}

@inproceedings{11866,
  abstract     = {{In this work, a splitting and weighting scheme that allows for splitting a Gaussian density into a Gaussian mixture density (GMM) is extended to allow the mixture components to be arranged along arbitrary directions. The parameters of the Gaussian mixture are chosen such that the GMM and the original Gaussian still exhibit equal central moments up to an order of four. The resulting mixtures{\rq} covariances will have eigenvalues that are smaller than those of the covariance of the original distribution, which is a desirable property in the context of non-linear state estimation, since the underlying assumptions of the extended Kalman filter are better justified in this case. Application to speech feature enhancement in the context of noise-robust automatic speech recognition reveals the beneficial properties of the proposed approach in terms of a reduced word error rate on the Aurora 2 recognition task.}},
  author       = {{Leutnant, Volker and Krueger, Alexander and Haeb-Umbach, Reinhold}},
  booktitle    = {{Interspeech 2011}},
  title        = {{{A versatile Gaussian splitting approach to non-linear state estimation and its application to noise-robust ASR}}},
  year         = {{2011}},
}

@inproceedings{11911,
  abstract     = {{In this paper we address the problem of initial seed selection for frequency domain iterative blind speech separation (BSS) algorithms. The derivation of the seeding algorithm is guided by the goal to select samples which are likely to be caused by source activity and not by noise and at the same time originate from different sources. The proposed algorithm has moderate computational complexity and finds better seed values than alternative schemes, as is demonstrated by experiments on the database of the SiSEC2010 challenge.}},
  author       = {{Tran Vu, Dang Hai and Haeb-Umbach, Reinhold}},
  booktitle    = {{Interspeech 2011}},
  title        = {{{On Initial Seed Selection for Frequency Domain Blind Speech Separation}}},
  year         = {{2011}},
}

@book{11945,
  editor       = {{Kolossa, Dorothea and Haeb-Umbach, Reinhold}},
  publisher    = {{Springer}},
  title        = {{{Robust Speech Recognition of Uncertain or Missing Data --- Theory and Applications}}},
  year         = {{2011}},
}

@inproceedings{11889,
  abstract     = {{In this paper we propose to jointly consider Segmental Dynamic Time Warping and distance clustering for the unsupervised learning of acoustic events. As a result, the computational complexity increases only linearly with the database size compared to a quadratic increase in a sequential setup, where all pairwise SDTW distances between segments are computed prior to clustering. Further, we discuss options for seed value selection for clustering and show that drawing seeds with a probability proportional to the distance from the already drawn seeds, known as K-means++ clustering, results in a significantly higher probability of finding representatives of each of the underlying classes, compared to the commonly used draws from a uniform distribution. Experiments are performed on an acoustic event classification and an isolated digit recognition task, where on the latter the final word accuracy approaches that of supervised training.}},
  author       = {{Schmalenstroeer, Joerg and Bartek, Markus and Haeb-Umbach, Reinhold}},
  booktitle    = {{Interspeech 2011}},
  title        = {{{Unsupervised learning of acoustic events using dynamic time warping and hierarchical K-means++ clustering}}},
  year         = {{2011}},
}

@inproceedings{11896,
  abstract     = {{In this paper we propose a procedure for estimating the geometric configuration of an arbitrary acoustic sensor placement. It determines the position and the orientation of microphone arrays in 2D while locating a source by direction-of-arrival (DoA) estimation. Neither artificial calibration signals nor unnatural user activity are required. The problem of scale indeterminacy inherent to DoA-only observations is solved by adding time difference of arrival (TDOA) measurements. The geometry calibration method is numerically stable and delivers precise results in moderately reverberated rooms. Simulation results are confirmed by laboratory experiments.}},
  author       = {{Schmalenstroeer, Joerg and Jacob, Florian and Haeb-Umbach, Reinhold and Hennecke, Marius and Fink, Gernot A.}},
  booktitle    = {{Interspeech 2011}},
  title        = {{{Unsupervised Geometry Calibration of Acoustic Sensor Networks Using Source Correspondences}}},
  year         = {{2011}},
}

@inproceedings{9456,
  abstract     = {{In this paper we present our experimental results about classifying audio data into broad acoustic categories. The reverberated sound samples from indoor recordings are grouped into four classes, namely speech, music, acoustic events and noise. We investigated a total of 188 acoustic features and achieved for the best configuration a classification accuracy better than 98\%. This was achieved by a 42-dimensional feature vector consisting of Mel-Frequency Cepstral Coefficients, an autocorrelation feature and so-called track features that measure the length of ``traces'' of high energy in the spectrogram. We also found a 4-feature configuration with a classification rate of about 90\% allowing for broad acoustic category classification with low computational effort.}},
  author       = {{Schmalenstroeer, Joerg and Bartek, Markus and Haeb-Umbach, Reinhold}},
  booktitle    = {{37. Deutsche Jahrestagung fuer Akustik (DAGA 2011)}},
  title        = {{{Investigations into Features for Robust Classification into Broad Acoustic Categories}}},
  year         = {{2011}},
}

@inproceedings{11726,
  abstract     = {{In this paper we present a robust location estimation algorithm especially focused on the accuracy in vertical position. A loosely-coupled error state space Kalman filter, which fuses sensor data of an Inertial Measurement Unit and the output of a Global Positioning System device, is augmented by height information from an altitude measurement unit. This unit consists of a barometric altimeter whose output is fused with topographic map information by a Kalman filter to provide robust information about the current vertical user position. These data replace the less reliable vertical position information provided by the GPS device. It is shown that typical barometric errors like thermal divergences and fluctuations in the pressure due to changing weather conditions can be compensated by the topographic map information and the barometric error Kalman filter. The resulting height information is shown not only to be more reliable than height information provided by GPS. It also turns out that it leads to better attitude and thus better overall localization estimation accuracy due to the coupling of spatial orientations via the Direct Cosine Matrix. Results are presented both for artificially generated and field test data, where the user is moving by car.}},
  author       = {{Bevermeier, Maik and Walter, Oliver and Peschke, Sven and Haeb-Umbach, Reinhold}},
  booktitle    = {{7th Workshop on Positioning Navigation and Communication (WPNC 2010)}},
  keywords     = {{altitude measurement unit, barometers, barometric altimeter, barometric error Kalman filter, barometric height estimation, direct cosine matrix, global positioning system, Global Positioning System, GPS device, height information, height measurement, inertial measurement unit, Kalman filters, loosely-coupled error state space Kalman filter, loosely-coupled Kalman-filter, map matching, robust information, robust location estimation, sensor fusion, topographic map information, vertical user position}},
  pages        = {{128--134}},
  title        = {{{Barometric height estimation combined with map-matching in a loosely-coupled Kalman-filter}}},
  doi          = {{10.1109/WPNC.2010.5650745}},
  year         = {{2010}},
}

@article{11846,
  abstract     = {{In this paper, we present a new technique for automatic speech recognition (ASR) in reverberant environments. Our approach is aimed at the enhancement of the logarithmic Mel power spectrum, which is computed at an intermediate stage to obtain the widely used Mel frequency cepstral coefficients (MFCCs). Given the reverberant logarithmic Mel power spectral coefficients (LMPSCs), a minimum mean square error estimate of the clean LMPSCs is computed by carrying out Bayesian inference. We employ switching linear dynamical models as an a priori model for the dynamics of the clean LMPSCs. Further, we derive a stochastic observation model which relates the clean to the reverberant LMPSCs through a simplified model of the room impulse response (RIR). This model requires only two parameters, namely RIR energy and reverberation time, which can be estimated from the captured microphone signal. The performance of the proposed enhancement technique is studied on the AURORA5 database and compared to that of constrained maximum-likelihood linear regression (CMLLR). It is shown by experimental results that our approach significantly outperforms CMLLR and that up to 80\% of the errors caused by the reverberation are recovered. In addition to the fact that the approach is compatible with the standard MFCC feature vectors, it leaves the ASR back-end unchanged. It is of moderate computational complexity and suitable for real time applications.}},
  author       = {{Krueger, Alexander and Haeb-Umbach, Reinhold}},
  journal      = {{IEEE Transactions on Audio, Speech, and Language Processing}},
  keywords     = {{ASR, AURORA5 database, automatic speech recognition, Bayesian inference, belief networks, CMLLR, computational complexity, constrained maximum likelihood linear regression, least mean squares methods, LMPSC computation, logarithmic Mel power spectrum, maximum likelihood estimation, Mel frequency cepstral coefficients, MFCC feature vectors, microphone signal, minimum mean square error estimation, model-based feature enhancement, regression analysis, reverberant speech recognition, reverberation, RIR energy, room impulse response, speech recognition, stochastic observation model, stochastic processes}},
  number       = {{7}},
  pages        = {{1692--1707}},
  title        = {{{Model-Based Feature Enhancement for Reverberant Speech Recognition}}},
  doi          = {{10.1109/TASL.2010.2049684}},
  volume       = {{18}},
  year         = {{2010}},
}

@inproceedings{11857,
  abstract     = {{Traditionally, ASR systems are based on hidden Markov models with Gaussian mixtures modelling the state-conditioned feature distribution. The inherent assumption of conditional independence, stating that a feature's likelihood solely depends on the current HMM state, makes the search computationally tractable, nevertheless has also been identified to be a major reason for the lack of robustness of such systems. Linear dynamic models have been proposed to overcome this weakness by employing a hidden dynamic state process underlying the observed features. Though performance of linear dynamic models on continuous speech/phone recognition tasks has been shown to be superior to that of equivalent static models, this approach still cannot compete with the established acoustic models. In this paper we consider the combination of hidden Markov models based on Gaussian mixture densities (GMM-HMMs) and linear dynamic models (LDMs) as the acoustic model for automatic speech recognition systems. In doing so, the individual strengths of both models, i.e. the modelling of long-term temporal dependencies by the GMM-HMM and the direct modelling of statistical dependencies between consecutive feature vectors by the LDM, are exploited. Phone classification experiments conducted on the TIMIT database indicate the prospective use of this approach for the application to continuous speech recognition.}},
  author       = {{Leutnant, Volker and Haeb-Umbach, Reinhold}},
  booktitle    = {{36. Deutsche Jahrestagung fuer Akustik (DAGA 2010)}},
  title        = {{{Options for Modelling Temporal Statistical Dependencies in an Acoustic Model for ASR}}},
  year         = {{2010}},
}

@inproceedings{11858,
  abstract     = {{Linear dynamic models (LDMs) have been shown to be a viable alternative to hidden Markov models (HMMs) on small-vocabulary recognition tasks, such as phone classification. In this paper we investigate various statistical model combination approaches for a hybrid HMM-LDM recognizer, resulting in a phone classification performance that outperforms the best individual classifier. Further, we report on continuous speech recognition experiments on the AURORA4 corpus, where the model combination is carried out on wordgraph rescoring. While the hybrid system improves the HMM system in the case of monophone HMMs, the performance of the triphone HMM model could not be improved by monophone LDMs, asking for the need to introduce context-dependency also in the LDM model inventory.}},
  author       = {{Leutnant, Volker and Haeb-Umbach, Reinhold}},
  booktitle    = {{Interspeech 2010}},
  title        = {{{On the Exploitation of Hidden Markov Models and Linear Dynamic Models in a Hybrid Decoder Architecture for Continuous Speech Recognition}}},
  year         = {{2010}},
}

@inproceedings{11887,
  abstract     = {{We describe an algorithm that performs regularized non-negative matrix factorization (NMF) to find independent components in non-negative data. Previous techniques proposed for this purpose require the data to be grounded, with support that goes down to 0 along each dimension. In our work, this requirement is eliminated. Based on it, we present a technique to find a low-dimensional decomposition of spectrograms by casting it as a problem of discovering independent non-negative components from it. The algorithm itself is implemented as regularized non-negative matrix factorization (NMF). Unlike other ICA algorithms, this algorithm computes the mixing matrix rather than an unmixing matrix. This algorithm provides a better decomposition than standard NMF when the underlying sources are independent. It makes better use of additional observation streams than previous non-negative ICA algorithms.}},
  author       = {{Raj, Bhiksha and Wilson, Kevin W. and Krueger, Alexander and Haeb-Umbach, Reinhold}},
  booktitle    = {{Interspeech 2010}},
  title        = {{{Ungrounded Independent Non-Negative Factor Analysis}}},
  year         = {{2010}},
}

