@inproceedings{11842,
  abstract     = {{In this paper we present some experiments that have been performed while developing language models for the PHILIPS Broadcast News system. Three main issues will be discussed: construction of phrases, adaptation of remote corpora to this task, and the combination of the different models. Also, perplexities on the 1997 evaluation data are reported.}},
  author       = {Klakow, Dietrich and Aubert, Xavier L. and Haeb-Umbach, Reinhold and Beyerlein, Peter and Ullrich, Meinhard and Wendemuth, Andreas and Wilcox, Patricia},
  booktitle    = {{DARPA Broadcast News Transcription and Understanding Workshop, Lansdowne}},
  title        = {{{Language-Model Investigations related to Broadcast News}}},
  year         = {{1998}},
}

@inproceedings{11936,
  abstract     = {{Although speaker normalization is attempted in very different manners, vocal tract normalization (VTN) and speaker adaptive training (SAT) share many common properties. We show that both lead to more compact representations of the phonetically relevant variations of the training data and that both achieve improved error rate performance only if a complementary normalization or adaptation operation is conducted on the test data. Algorithms for fast test speaker enrollment are presented for both normalization methods: in the framework of SAT, a pre-transformation step is proposed, which alone, i.e. without subsequent unsupervised MLLR adaption, reduces the error rate by almost 10% on the WSJ 5k test sets. For VTN, the use of a Gaussian mixture model makes obsolete a first recognition pass to obtain a preliminary transcription of the test utterance at hardly any loss in performance.}},
  author       = {Welling, L. and Haeb-Umbach, Reinhold and Aubert, Xavier L. and Haberland, N.},
  booktitle    = {{ICASSP 1998, Seattle}},
  title        = {{{A Study on Speaker Normalization Using Vocal Tract Normalization and Speaker Adaptive Training}}},
  year         = {{1998}},
}

@inproceedings{11750,
  abstract     = {{Addresses the problem of online, writer-independent, unconstrained handwriting recognition. Based on hidden Markov models (HMM), which are successfully employed in speech recognition tasks, we focus on representations which address scalability, recognition performance and compactness. 'Delayed' features are introduced which integrate more global, handwriting specific knowledge into the HMM representation. These features lead to larger error-rate reduction than 'delta' features which are known from speech recognition and even require fewer additional components. Scalability is addressed with a size-independent representation. Compactness is achieved with linear discriminant analysis. The representations are discussed and the results for a mixed-style word recognition task with vocabularies of 200 (up to 99% correct words) and 20000 words (up to 88.8% correct words) are given.}},
  author       = {Dolfing, J. G. A. and Haeb-Umbach, Reinhold},
  booktitle    = {{ICASSP, Munich}},
  title        = {{{Signal Representations for Hidden Markov Model Based On-Line Handwriting Recognition}}},
  year         = {{1997}},
}

@article{11766,
  abstract     = {{This paper reports the design of a command-based speech interface for an answering machine or a voice mail system. Automatic speech recognition was integrated in order to facilitate the remote control and the retrieval of voice messages from any telephone in a speech-only dialogue. The design goal was that consumers would perceive the speech interface as a benefit compared with the common touch-tone interface. In this paper we will first describe the speech technology underlying the system. Then it will be shown how, based on this technology, the user interface was designed in a top-down approach. We started with the development of a concept and tested it by means of a Wizard-of-Oz simulation. After refining the concept in parallel design, it was implemented in a high-fidelity prototype. By means of qualitative user testing the design was improved in three iteration steps. The achievement of the design goal was finally verified with user tests in two countries.}},
  author       = {Gamm, Stephan and Haeb-Umbach, Reinhold and Langmann, Detlev},
  journal      = {{Speech Communication}},
  title        = {{{The development of a command-based speech interface for a telephone answering machine}}},
  year         = {{1997}},
}

@inproceedings{11781,
  abstract     = {{The increased popularity of mobile telephony introduces both challenges and opportunities for automatic speech recognition. ASR offers ways to simplify the use of mobile phones, notably in hands- and eyes-busy situations. However, the acoustic environment can be severely degraded and the wireless network may add additional distortions to the speech signal. This paper gives an overview of the sources of degradation and attempts to robust speech recognition for mobile communications. Emphasis is placed on approaches which are suitable for implementation in mobile terminals. Two example applications are described which illustrate the robustness issues and design considerations typical of low-cost noisy speech recognition: voice-dialling in a GSM phone and hands-free digit recognition in the car.}},
  author       = {Haeb-Umbach, Reinhold},
  booktitle    = {{Eurospeech}},
  title        = {{{Robust Speech Recognition for Wireless Networks and Mobile Telephony}}},
  year         = {{1997}},
}

@inproceedings{11819,
  abstract     = {{The SpeechDat project aims to produce speech databases for all official languages of the European Union and some major dialectal variants and minority languages resulting in 28 speech databases. They will be recorded over fixed and mobile telephone networks. This will provide a realistic basis for training and assessment of both isolated and continuous-speech utterances, employing whole-word or subword approaches, and thus can be used for developing voice driven teleservices including speaker verification. The specification of the databases has been developed jointly, and is essentially the same for each language to facilitate dissemination and use. There will be a controlled variation among the speakers concerning sex, age, dialect, environment of call, etc. The validation of all databases will be carried out centrally. The SpeechDat databases will be transferred to ELRA for distribution. The next databases to be recorded will cover East European languages.}},
  author       = {Hoege, H. and Tropf, H. S. and Winsky, R. and van den Heuvel, H. and Haeb-Umbach, Reinhold and Choukri, K.},
  booktitle    = {{ICASSP, Munich}},
  title        = {{{European Speech Databases for Telephone Applications}}},
  year         = {{1997}},
}

@inproceedings{11852,
  abstract     = {{This paper describes speaker-independent speech recognition experiments concerning acoustic front end processing on a speech database that was recorded in 3 different cars. We investigate different feature analysis approaches (mel-filter bank, mel-cepstrum, perceptually linear predictive coding) and present results with noise compensation techniques based on spectral subtraction. Although the methods employed lead to considerable error rate reduction the error analysis shows that low signal-to-noise ratios are still a problem.}},
  author       = {Langmann, Detlev and Fischer, Alexander and Wuppermann, Friedhelm and Haeb-Umbach, Reinhold and Eisele, Thomas},
  booktitle    = {{Eurospeech}},
  title        = {{{Acoustic Front Ends for Speaker-Independent Digit Recognition in Car Environments}}},
  year         = {{1997}},
}

@inproceedings{11855,
  author       = {Langmann, Detlev and Wuppermann, Friedhelm and Haeb-Umbach, Reinhold and Fischer, Alexander and Eisele, Thomas},
  booktitle    = {{Aachener Kolloquium on Signal Theory}},
  title        = {{{Investigation of Acoustic Front Ends for Speaker-Independent Speech Recognition in the Car}}},
  year         = {{1997}},
}

@inproceedings{11761,
  abstract     = {{Although widely used, there are still open questions concerning which properties of linear discriminant analysis (LDA) account for its success in many speech recognition systems. In order to gain more insight into the nature of the transformation we compare LDA with mel-cepstral feature vectors with respect to the following criteria: decorrelation and ordering property; invariance under linear transforms; automatic learning of dynamical features; and data dependence of the transformation.}},
  author       = {Eisele, Thomas and Haeb-Umbach, Reinhold and Langmann, Detlev},
  booktitle    = {{ICSLP, Philadelphia}},
  title        = {{{A Comparative Study of Linear Feature Transformation Techniques for Automatic Speech Recognition}}},
  year         = {{1996}},
}

@inproceedings{11767,
  abstract     = {{This paper tells the story of the design of a command-based speech interface for a voice mail system. Speech recognition was integrated in the voice mail system in order to allow the remote interrogation of messages in a speech-only dialogue. Our design goal was that consumers would perceive voice control as a clear benefit versus touch-tone control. It is shown how the speech interface was designed in a top-down approach. We started with a concept development and tested it by means of a Wizard-of-Oz simulation. After refining the concept in parallel design, the design was implemented in a high-fidelity prototype. By means of qualitative user testing it was improved in three iteration steps. We verified the achievement of our design goal with tests in two countries.}},
  author       = {Gamm, Stephan and Haeb-Umbach, Reinhold and Langmann, Detlev},
  booktitle    = {{IEEE Workshop on Interactive Voice Technology for Telecommunications Applications}},
  title        = {{{Findings with the Design of a Command-Based Speech Interface for a Voice Mail System}}},
  year         = {{1996}},
}

@inproceedings{11853,
  abstract     = {{The paper describes the design, collection and postprocessing of the French SpeechDat corpus FRESCO. Being a database of approximately 35000 utterances recorded from 1000 callers over the terrestrial telephone network in France, it comprises immediately usable and relevant speech for the initial training and assessment of speaker independent phoneme model or word model based speech recognizers, as they are employed in automated telephone services. FRESCO is one of the 1000 speaker telephone speech databases produced as "case studies" within the European project SpeechDat(M).}},
  author       = {Langmann, Detlev and Haeb-Umbach, Reinhold},
  booktitle    = {{ICSLP, Philadelphia}},
  title        = {{{FRESCO: The French Telephone Speech Data Collection - Part of the European SpeechDat(M) Project}}},
  year         = {{1996}},
}

@inproceedings{11854,
  author       = {Langmann, Detlev and Haeb-Umbach, Reinhold and Eisele, Thomas},
  booktitle    = {{ITG Fachtagung Sprachkommunikation, Frankfurt}},
  title        = {{{Robust Rejection Modeling for a Small-Vocabulary Application}}},
  year         = {{1996}},
}

@inproceedings{11757,
  abstract     = {{Clustering techniques have been integrated at different levels into the training procedure of a continuous-density hidden Markov model (HMM) speech recognizer. These clustering techniques can be used in two ways. First acoustically similar states are tied together. It will help to reduce the number of parameters but also allow to train otherwise rarely seen states together with more robust ones (state-tying). Secondly densities are clustered across states, this reduces the number of densities while at the same time keeping the best performances of our recognizer (density-clustering). We have applied these techniques both to word-based small-vocabulary and phoneme-based large-vocabulary recognition tasks. On the WSJ task, we could achieve a reduction of the word error rate by 7%. On the TI/NIST-connected digit task, the number of parameters was reduced by a factor 2-3 while keeping the same string error rate.}},
  author       = {Dugast, Christian and Beyerlein, Peter and Haeb-Umbach, Reinhold},
  booktitle    = {{ICASSP, Detroit}},
  title        = {{{Application of Clustering Techniques to Mixture Density Modelling for Continuous-Speech Recognition}}},
  year         = {{1995}},
}

@article{11764,
  abstract     = {{Today speech recognition of a small vocabulary can be realized so cost-effectively that the technology can penetrate into consumer electronics. But, as first applications that failed on the market show, it is by no means obvious how to incorporate voice control in a user interface. This paper addresses the issue of how to design a voice control so that the user perceives it as a benefit. User interface guidelines that are adapted or specific to voice control are presented. Then the process of designing a voice control in the user-centred approach is described. By means of two examples, the car stereo and telephone answering machine, it is shown how this is turned into practice.}},
  author       = {Gamm, Stephan and Haeb-Umbach, Reinhold},
  journal      = {{Philips Journal of Research}},
  title        = {{{User interface design of voice controlled consumer electronics}}},
  year         = {{1995}},
}

@inproceedings{11765,
  author       = {Gamm, Stephan and Haeb-Umbach, Reinhold},
  booktitle    = {{Eurospeech, Madrid}},
  title        = {{{Human Factors of a Voice-Controlled Car Stereo}}},
  year         = {{1995}},
}

@inproceedings{11768,
  author       = {Gamm, Stephan and Haeb-Umbach, Reinhold and Langmann, Detlev},
  booktitle    = {{International Symposium on Human Factors in Telecommunications, Melbourne}},
  title        = {{{The Usability Engineering of a Voice-Controlled Answering Machine}}},
  year         = {{1995}},
}

@article{11786,
  abstract     = {{Recognition accuracy has been the primary objective of most speech recognition research, and impressive results have been obtained, e.g. less than 0.3% word error rate on a speaker-independent digit recognition task. When it comes to real-world applications, robustness and real-time response might be more important issues. For the first requirement we review some of the work on robustness and discuss one specific technique, spectral normalization, in more detail. The requirement of real-time response has to be considered in the light of the limited hardware resources in voice control applications, which are due to the tight cost constraints. In this paper we discuss in detail one specific means to reduce the processing and memory demands: a clustering technique applied at various levels within the acoustic modelling.}},
  author       = {Haeb-Umbach, Reinhold and Beyerlein, Peter and Geller, Dieter},
  journal      = {{Philips Journal of Research}},
  title        = {{{Speech recognition algorithms for voice control interfaces}}},
  year         = {{1995}},
}

@inproceedings{11787,
  abstract     = {{We address the problem of automatically finding an acoustic representation (i.e. a transcription) of unknown words as a sequence of subword units, given a few sample utterances of the unknown words, and an inventory of speaker-independent subword units. The problem arises if a user wants to add his own vocabulary to a speaker-independent recognition system simply by speaking the words a few times. Two methods are investigated which are both based on a maximum-likelihood formulation of the problem. The experimental results show that both automatic transcription methods provide a good estimate of the acoustic models of unknown words. The recognition error rates obtained with such models in a speaker-independent recognition task are clearly better than those resulting from separate whole-word models. They are comparable with the performance of transcriptions drawn from a dictionary.}},
  author       = {Haeb-Umbach, Reinhold and Beyerlein, Peter and Thelen, E.},
  booktitle    = {{ICASSP, Detroit}},
  title        = {{{Automatic Transcription of Unknown Words in a Speech Recognition System}}},
  year         = {{1995}},
}

@article{11905,
  abstract     = {{This paper gives an overview of the Philips Research system for continuous-speech recognition. The recognition architecture is based on an integrated statistical approach. The system has been successfully applied to various tasks in American English and German, ranging from small vocabulary tasks to very large vocabulary tasks and from recognition only to speech understanding. Here, we concentrate on phoneme-based continuous-speech recognition for large vocabulary recognition as used for dictation, which covers a significant part of our research work on speech recognition. We describe this task and report on experimental results. In order to allow a comparison with the performance of other systems, a section with an evaluation on the standard North American Business news (NAB2) task (dictation of American English newspaper text) is supplied.}},
  author       = {Steinbiss, Volker and Ney, Hermann J. and Aubert, Xavier L. and Besling, Stefan and Dugast, Christian and Essen, Ute and Geller, Dieter and Haeb-Umbach, Reinhold and Kneser, Reinhard and Meier, Hans Günter and Oerder, Martin and Tran, Bach Hiep},
  journal      = {{Philips Journal of Research}},
  title        = {{{The Philips Research system for continuous-speech dictation}}},
  year         = {{1995}},
}

@article{11948,
  abstract     = {{This paper gives an overview of the Philips research system for phoneme-based, large-vocabulary, continuous-speech recognition. The system has been successfully applied to various tasks in the German and (American) English languages, ranging from small vocabulary tasks to very large vocabulary tasks. Here, we concentrate on continuous-speech recognition for dictation in real applications, the dictation of legal reports and radiology reports in German. We describe this task and report on experimental results. We also describe a commercial PC-based dictation system which includes a PC implementation of our scientific recognition prototype. In order to allow for a comparison with the performance of other systems, a section with an evaluation on the standard Wall Street Journal task (dictation of American English newspaper text) is supplied. The recognition architecture is based on an integrated statistical approach. We describe the characteristic features of the system as opposed to other systems: 1. the Viterbi criterion is consistently applied both in training and testing; 2. continuous mixture densities are used without tying or smoothing; 3. time-synchronous beam search in connection with a phoneme look-ahead is applied to a tree-organized lexicon.}},
  author       = {Steinbiss, Volker and Ney, Hermann J. and Essen, Ute and Tran, Bach Hiep and Aubert, Xavier L. and Dugast, Christian and Kneser, Reinhard and Meier, Hans Günter and Oerder, Martin and Haeb-Umbach, Reinhold and Geller, Dieter and Hoellerbauer, W. and Bartosik, H.},
  journal      = {{Speech Communication}},
  title        = {{{Continuous speech dictation - From theory to practice}}},
  year         = {{1995}},
}

