@inproceedings{11824,
  abstract     = {Soft-feature based speech recognition, which is an example of uncertainty decoding, has been proven to be a robust error mitigation method for distributed speech recognition over wireless channels exhibiting bit errors. In this paper we extend this concept to packet-oriented transmissions. The a posteriori probability density function of the lost feature vector, given the closest received neighbours, is computed. In the experiments, the nearest frame repetition, which is shown to be equivalent to the MAP estimate, outperforms the MMSE estimate for long bursts. Taking the variance into account at the speech recognition stage results in superior performance compared to classical schemes using point estimates. A computationally and memory efficient implementation of the proposed packet loss compensation scheme based on table lookup is presented.},
  author       = {Ion, Valentin and Haeb-Umbach, Reinhold},
  booktitle    = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2006)},
  keywords     = {distributed speech recognition, least mean squares methods, MAP estimate, maximum likelihood estimation, MMSE estimate, packet loss compensation scheme, packet switched communication, posteriori probability density function, robust error mitigation method, soft-features, speech recognition, table lookup, voice communication, wireless channels},
  pages        = {I},
  title        = {An Inexpensive Packet Loss Compensation Scheme for Distributed Speech Recognition Based on Soft-Features},
  doi          = {10.1109/ICASSP.2006.1659984},
  volume       = {1},
  year         = {2006},
}

@article{11825,
  abstract     = {In this paper, we propose an enhanced error concealment strategy at the server side of a distributed speech recognition (DSR) system, which is fully compatible with the existing DSR standard. It is based on a Bayesian approach, where the a posteriori probability density of the error-free feature vector is computed, given all received feature vectors which are possibly corrupted by transmission errors. Rather than computing a point estimate, such as the MMSE estimate, and plugging it into the Bayesian decision rule, we employ uncertainty decoding, which results in an integration over the uncertainty in the feature domain. In a typical scenario the communication between the thin client, often a mobile device, and the recognition server spreads across heterogeneous networks. Both bit errors on circuit-switched links and lost data packets on IP connections are mitigated by our approach in a unified manner. The experiments reveal improved robustness both for small- and large-vocabulary recognition tasks.},
  author       = {Ion, Valentin and Haeb-Umbach, Reinhold},
  journal      = {Speech Communication},
  keywords     = {Channel error robustness, Distributed speech recognition, Soft features, Uncertainty decoding},
  number       = {11},
  pages        = {1435--1446},
  title        = {Uncertainty decoding for distributed speech recognition over error-prone networks},
  doi          = {10.1016/j.specom.2006.03.007},
  volume       = {48},
  year         = {2006},
}

@inproceedings{11943,
  abstract     = {A marginalized particle filter is proposed for performing single channel speech enhancement with a non-linear dynamic state model. The system consists of a particle filter for tracking line spectral pair (LSP) parameters and a Kalman filter per particle for speech enhancement. The state model for the LSPs has been learnt on clean speech training data. In our approach parameters and speech samples are processed at different time scales by assuming the parameters to be constant for small blocks of data. Further enhancement is obtained by an iteration which can be applied on these small blocks. The experiments show that similar SNR gains are obtained as with the Kalman-LM-iterative algorithm. However better values of the noise level and the log-spectral distance are achieved.},
  author       = {Windmann, Stefan and Haeb-Umbach, Reinhold},
  booktitle    = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2006)},
  keywords     = {clean speech training data, iterative methods, iterative speech enhancement, Kalman filter, Kalman filters, Kalman-LM-iterative algorithm, line spectral pair parameters, log-spectral distance, marginalized particle filter, noise level, nonlinear dynamic state speech model, particle filtering (numerical methods), single channel speech enhancement, SNR gains, speech enhancement, speech samples},
  pages        = {I},
  title        = {Iterative Speech Enhancement using a Non-Linear Dynamic State Model of Speech and its Parameters},
  doi          = {10.1109/ICASSP.2006.1660058},
  volume       = {1},
  year         = {2006},
}

@inproceedings{11828,
  abstract     = {In this paper we present a comparison of the recently proposed Soft-Feature Distributed Speech Recognition (SFDSR) with the two evaluated candidate codecs for Speech Enabled Services over wireless networks: Adaptive Multirate Codec (AMR) and the ETSI Extended Advanced Front-End for Distributed Speech Recognition (XAFE). It is shown that SFDSR achieves the best recognition performance on a simulated GSM transmission, followed by XAFE and AMR. We also present some new results concerning SFDSR which demonstrate the versatility of the approach. Further, a simple method is introduced which considerably reduces the computational effort.},
  author       = {Ion, Valentin and Haeb-Umbach, Reinhold},
  booktitle    = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2005)},
  keywords     = {adaptive codes, adaptive multirate codec, AMR, distributed speech recognition, ETSI, extended advanced front-end, recognition performance, SFDSR, simulated GSM transmission, soft-feature distributed speech recognition, speech codecs, speech coding, speech recognition, variable rate codes, XAFE},
  pages        = {333--336},
  title        = {A Comparison of Soft-Feature Distributed Speech Recognition with Candidate Codecs for Speech Enabled Mobile Services},
  doi          = {10.1109/ICASSP.2005.1415118},
  volume       = {1},
  year         = {2005},
}

@inproceedings{11931,
  abstract     = {The paper is concerned with binaural signal processing for a bimodal human-robot interface with hearing and vision. The two microphone signals are processed to obtain an enhanced single-channel input signal for the subsequent speech recognizer and to localize the acoustic source, an important information for establishing a natural human-robot communication. We utilize a robust adaptive algorithm for filter-and-sum beamforming (FSB) and extract speaker direction information from the resulting FIR filter coefficients. Further, particle filtering is applied which conducts a nonlinear Bayesian tracking of speaker movement. Good location accuracy can be achieved even in highly reverberant environments. The results obtained outperform the conventional generalized cross correlation (GCC) method.},
  author       = {Warsitz, Ernst and Haeb-Umbach, Reinhold},
  booktitle    = {IEEE Workshop on Multimedia Signal Processing (MMSP 2004)},
  keywords     = {bimodal human-robot interface, binaural signal processing, enhanced single-channel input signal, filter-and-sum beamforming, filtering theory, FIR filter coefficient, generalized cross correlation method, microphones, microphone signal, nonlinear Bayesian tracking, particle filtering, robust adaptive algorithm, robust speaker direction estimation, signal processing, speech enhancement, speech recognition, speech recognizer, user interfaces},
  pages        = {367--370},
  title        = {Robust speaker direction estimation with particle filtering},
  doi          = {10.1109/MMSP.2004.1436569},
  year         = {2004},
}

@inproceedings{39053,
  abstract     = {Portable devices come with different limitations in user interaction like limited display size, small keyboard, and different sorts of input and output capabilities. With the advance of speech recognition and speech synthesis technologies, their complementary use becomes attractive for mobile devices in order to implement real multimodal user interaction. However, current systems and formats do not sufficiently integrate advanced multimodal interactions. We introduce an advanced generic multimodal interaction and rendering system (MIRS) dedicated for mobile devices. MIRS incorporates efficient processing of XML specification languages for limited, mobile devices and comes with the XML-based dialog and interface specification language (DISL). DISL can be considered as an UIML subset, which is enhanced by the means of state-oriented dialog specifications. The dialog specification is based on ODSN (object oriented dialog specification notation), which has been introduced to define user interface control by means of interaction states with transition rules.},
  author       = {M{\"u}ller, Wolfgang and Sch{\"a}fer, Robbie and Bleul, Steffen},
  booktitle    = {Proceedings of HICSS-37},
  isbn         = {0-7695-2056-1},
  keywords     = {User interfaces, Speech recognition, Streaming media, Specification languages, Keyboards, Speech synthesis, Rendering (computer graphics), Ambient intelligence, Humans, Displays},
  location     = {Waikoloa, HI, USA},
  title        = {Interactive Multimodal User Interfaces for Mobile Devices},
  doi          = {10.1109/HICSS.2004.1265674},
  year         = {2004},
}

@article{11778,
  abstract     = {In this paper, it is shown that a correlation criterion is the appropriate criterion for bottom-up clustering to obtain broad phonetic class regression trees for maximum likelihood linear regression (MLLR)-based speaker adaptation. The correlation structure among speech units is estimated on the speaker-independent training data. In adaptation experiments the tree outperformed a regression tree obtained from clustering according to closeness in acoustic space and achieved results comparable with those of a manually designed broad phonetic class tree.},
  author       = {Haeb-Umbach, Reinhold},
  journal      = {IEEE Transactions on Speech and Audio Processing},
  keywords     = {acoustic space, adaptation experiments, automatic generation, bottom-up clustering, broad phonetic class regression trees, correlation criterion, correlation methods, maximum likelihood estimation, maximum likelihood linear regression based speaker adaptation, MLLR adaptation, pattern clustering, phonetic regression class trees, speaker-independent training data, speech recognition, speech units, statistical analysis, trees (mathematics)},
  number       = {3},
  pages        = {299--302},
  title        = {Automatic generation of phonetic regression class trees for {MLLR} adaptation},
  doi          = {10.1109/89.906003},
  volume       = {9},
  year         = {2001},
}

@misc{2433,
  author       = {Plessl, Christian and Maurer, Simon},
  keywords     = {co-design, speech processing},
  publisher    = {Computer Engineering and Networks Lab, ETH Zurich, Switzerland},
  title        = {Hardware/Software Codesign in Speech Compression Applications},
  year         = {2000},
}

@inproceedings{11869,
  abstract     = {Amongst several data driven approaches for designing filters for the time sequence of spectral parameters, the linear discriminant analysis (LDA) based method has been proposed for automatic speech recognition. Here we apply LDA-based filter design to cepstral features, which better match the inherent assumption of this method that feature vector components are uncorrelated. Extensive recognition experiments have been conducted both on the standard TIMIT phone recognition task and on a proprietary 130-words command word task under various adverse environmental conditions, including reverberant data with real-life room impulse responses and data processed by acoustic echo cancellation algorithms. Significant error rate reductions have been achieved when applying the novel long-range feature filters compared to standard approaches employing cepstral mean normalization and delta and delta-delta features, in particular when facing acoustic echo cancellation scenarios and room reverberation. For example, the phone accuracy on reverberated TIMIT data could be increased from 50.7\% to 56.0\%.},
  author       = {Lieb, M. and Haeb-Umbach, Reinhold},
  booktitle    = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2000)},
  keywords     = {acoustic echo cancellation algorithms, adverse environmental conditions, automatic speech recognition, cepstral analysis, cepstral features, cepstral mean normalization, command word task, delta-delta features, delta features, echo suppression, error rate reductions, feature vector components, FIR filters, LDA derived cepstral trajectory filters, linear discriminant analysis, long-range feature filters, phone accuracy, real-life room impulse responses, reverberant data, spectral parameters, speech recognition, standard TIMIT phone recognition task},
  pages        = {II1105--II1108},
  title        = {{LDA} derived cepstral trajectory filters in adverse environmental conditions},
  doi          = {10.1109/ICASSP.2000.859157},
  volume       = {2},
  year         = {2000},
}

