@article{35602,
  abstract     = {{Continuous Speech Separation (CSS) has been proposed to address speech overlaps during the analysis of realistic meeting-like conversations by eliminating any overlaps before further processing.
CSS separates a recording of arbitrarily many speakers into a small number of overlap-free output channels, where each output channel may contain speech of multiple speakers.
This is often done by applying a conventional separation model trained with Utterance-level Permutation Invariant Training (uPIT), which exclusively maps a speaker to an output channel, in a sliding window approach called stitching.
Recently, we introduced an alternative training scheme called Graph-PIT that teaches the separation network to directly produce output streams in the required format without stitching.
It can handle an arbitrary number of speakers as long as never more of them overlap at the same time than the separator has output channels.
In this contribution, we further investigate the Graph-PIT training scheme.
We show in extended experiments that models trained with Graph-PIT also work in challenging reverberant conditions.
Models trained in this way are able to perform segment-less CSS, i.e., without stitching, and achieve comparable and often better separation quality than the conventional CSS with uPIT and stitching.
We simplify the training schedule for Graph-PIT with the recently proposed Source Aggregated Signal-to-Distortion Ratio (SA-SDR) loss.
It eliminates unfavorable properties of the previously used A-SDR loss and thus enables training with Graph-PIT from scratch.
Graph-PIT training relaxes the constraints w.r.t. the allowed numbers of speakers and speaking patterns which allows using a larger variety of training data.
Furthermore, we introduce novel signal-level evaluation metrics for meeting scenarios, namely the source-aggregated scale- and convolution-invariant Signal-to-Distortion Ratio (SA-SI-SDR and SA-CI-SDR), which are generalizations of the commonly used SDR-based metrics for the CSS case.}},
  author       = {{von Neumann, Thilo and Kinoshita, Keisuke and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}},
  issn         = {{2329-9290}},
  journal      = {{IEEE/ACM Transactions on Audio, Speech, and Language Processing}},
  keywords     = {{Continuous Speech Separation, Source Separation, Graph-PIT, Dynamic Programming, Permutation Invariant Training}},
  pages        = {{576--589}},
  publisher    = {{Institute of Electrical and Electronics Engineers (IEEE)}},
  title        = {{{Segment-Less Continuous Speech Separation of Meetings: Training and Evaluation Criteria}}},
  doi          = {{10.1109/taslp.2022.3228629}},
  volume       = {{31}},
  year         = {{2023}},
}

@article{49636,
  abstract     = {{Purpose: Wearables serve to quantify the on-court activity in intermittent sports such as field hockey (FH). Based on objective data, benchmarks can be determined to tailor training intensity and volume. Next to average and accumulated values, the most intense periods (MIPs) during competitive FH matches are of special interest, since these quantify the peak intensities players experience throughout the intermittent matches. The aim of this study was to retrospectively compare peak intensities between training and competition sessions in a male FH team competing in the first German division.
Methods: Throughout an 8-week in-season period, 372 individual activity datasets (144 datasets from competitive sessions) were recorded using the Polar Team Pro sensor (Kempele, Finland). MIPs were calculated applying a rolling window approach with predefined window length (1--5 min) and calculated for Total distance, High-Intensity-Running distance ({\textgreater} 16 km/h), Sprinting distance ({\textgreater} 20 km/h) and Acceleration load. Significant differences between training and competition MIPs were analysed through non-parametric statistical tests (P {\textless} 0.05).
Results: Analyses revealed higher MIPs during competition for all considered outcomes (P {\textless} 0.001). Effect size estimation revealed strongest effects for sprinting distance (d = 1.89 to d = 1.22) and lowest effect sizes for acceleration load (d = 0.92 to d = 0.49).
Conclusion: The present findings demonstrate that peak intensities during training do not reach those experienced during competitive sessions in a male FH team. Training routines such as manipulations of court-dimensions and team sizes might contribute to this discrepancy. Coaches should compare training and competition intensities to recalibrate training routines to optimize athletes’ preparation for competition.}},
  author       = {{Büchel, Daniel and Döring, Michael and Baumeister, Jochen}},
  issn         = {{2096-6709}},
  journal      = {{Journal of Science in Sport and Exercise}},
  keywords     = {{Nutrition and Dietetics, Rehabilitation, Orthopedics and Sports Medicine, Physical Therapy, Sports Therapy and Rehabilitation, Physiology}},
  publisher    = {{Springer Science and Business Media LLC}},
  title        = {{{A Comparison of the Most Intense Periods (MIPs) During Competitive Matches and Training Over an 8-Week Period in a Male Elite Field Hockey Team}}},
  doi          = {{10.1007/s42978-023-00261-w}},
  year         = {{2023}},
}

% Sherman et al. 2023, Clinical Neurophysiology 149:88-99 (DOI 10.1016/j.clinph.2023.02.175).
% Entry checked: fields complete and consistent with the file's double-brace export convention.
@article{45159,
  author       = {{Sherman, David A. and Baumeister, Jochen and Stock, Matt S. and Murray, Amanda M. and Bazett-Jones, David M. and Norte, Grant E.}},
  issn         = {{1388-2457}},
  journal      = {{Clinical Neurophysiology}},
  keywords     = {{Physiology (medical), Neurology (clinical), Neurology, Sensory Systems}},
  pages        = {{88--99}},
  publisher    = {{Elsevier BV}},
  title        = {{{Brain activation and single-limb balance following anterior cruciate ligament reconstruction}}},
  doi          = {{10.1016/j.clinph.2023.02.175}},
  volume       = {{149}},
  year         = {{2023}},
}

@inproceedings{48275,
  abstract     = {{MeetEval is an open-source toolkit to evaluate all kinds of meeting transcription systems.
It provides a unified interface for the computation of commonly used Word Error Rates (WERs), specifically cpWER, ORC WER and MIMO WER along with other WER definitions.
We extend the cpWER computation by a temporal constraint to ensure that only words are identified as correct when the temporal alignment is plausible.
This leads to a better quality of the matching of the hypothesis string to the reference string that more closely resembles the actual transcription quality, and a system is penalized if it provides poor time annotations.
Since word-level timing information is often not available, we present a way to approximate exact word-level timings from segment-level timings (e.g., a sentence) and show that the approximation leads to a similar WER as a matching with exact word-level annotations.
At the same time, the time constraint leads to a speedup of the matching algorithm, which outweighs the additional overhead caused by processing the time stamps.}},
  author       = {{von Neumann, Thilo and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}},
  booktitle    = {{Proc. CHiME 2023 Workshop on Speech Processing in Everyday Environments}},
  keywords     = {{Speech Recognition, Word Error Rate, Meeting Transcription}},
  location     = {{Dublin}},
  title        = {{{MeetEval: A Toolkit for Computation of Word Error Rates for Meeting Transcription Systems}}},
  year         = {{2023}},
}

% von Neumann et al., Interspeech 2021 — Graph-PIT paper (DOI 10.21437/interspeech.2021-1177).
% Entry checked: fields consistent; journal-article companion of this work is entry 35602 above.
@inproceedings{26770,
  abstract     = {{Automatic transcription of meetings requires handling of overlapped speech, which calls for continuous speech separation (CSS) systems. The uPIT criterion was proposed for utterance-level separation with neural networks and introduces the constraint that the total number of speakers must not exceed the number of output channels. When processing meeting-like data in a segment-wise manner, i.e., by separating overlapping segments independently and stitching adjacent segments to continuous output streams, this constraint has to be fulfilled for any segment. In this contribution, we show that this constraint can be significantly relaxed. We propose a novel graph-based PIT criterion, which casts the assignment of utterances to output channels in a graph coloring problem. It only requires that the number of concurrently active speakers must not exceed the number of output channels. As a consequence, the system can process an arbitrary number of speakers and arbitrarily long segments and thus can handle more diverse scenarios.
Further, the stitching algorithm for obtaining a consistent output order in neighboring segments is of less importance and can even be eliminated completely, not the least reducing the computational effort. Experiments on meeting-style WSJ data show improvements in recognition performance over using the uPIT criterion. }},
  author       = {{von Neumann, Thilo and Kinoshita, Keisuke and Boeddeker, Christoph and Delcroix, Marc and Haeb-Umbach, Reinhold}},
  booktitle    = {{Interspeech 2021}},
  keywords     = {{Continuous speech separation, automatic speech recognition, overlapped speech, permutation invariant training}},
  title        = {{{Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers}}},
  doi          = {{10.21437/interspeech.2021-1177}},
  year         = {{2021}},
}

@article{57971,
  abstract     = {{Repetitive TMS (rTMS) with a frequency of 5--10~Hz is widely used for language mapping. However, it may be accompanied by discomfort and is limited in the number and reliability of evoked language errors. We, here, systematically tested the influence of different stimulation frequencies (i.e., 10, 30, and 50 Hz) on tolerability, number, reliability, and cortical distribution of language errors aiming at improved language mapping. 15 right-handed, healthy subjects (m~=~8, median age: 29 yrs) were investigated in two sessions, separated by 2--5 days. In each session, 10, 30, and 50 Hz rTMS were applied over the left hemisphere in a randomized order during a picture naming task. Overall, 30 Hz rTMS evoked significantly more errors (20 $\pm$ 12{%}) compared to 50 Hz (12 $\pm$ 8{%}; p {\textless}.01), whereas error rates were comparable between 30/50 and 10~Hz (18 $\pm$ 11{%}). Across all conditions, a significantly higher error rate was found in Session 1 (19 $\pm$ 13{%}) compared to Session 2 (13 $\pm$ 7{%}, p {\textless}.05). The error rate was poorly reliable between sessions for 10 (intraclass correlation coefficient, ICC~=~.315) and 30 Hz (ICC~=~.427), whereas 50 Hz showed a moderate reliability (ICC~=~.597). Spatial reliability of language errors was low to moderate with a tendency toward increased reliability for higher frequencies, for example, within frontal regions. Compared to 10~Hz, both, 30 and 50 Hz were rated as less painful. Taken together, our data favor the use of rTMS-protocols employing higher frequencies for evoking language errors reliably and with reduced discomfort, depending on the region of interest.}},
  author       = {{Nettekoven, Charlotte and Pieczewski, Julia and Neuschmelting, Volker and Jonas, Kristina and Goldbrunner, Roland and Grefkes, Christian and Weiss Lucas, Carolin}},
  journal      = {{Human Brain Mapping}},
  keywords     = {{Adult, Brain Mapping, Cerebral Cortex/diagnostic imaging/physiology, Female, Humans, Magnetic Resonance Imaging, Male, Pattern Recognition, Psycholinguistics, Reproducibility of Results, Speech/physiology, Transcranial Magnetic Stimulation, Visual/physiology, Young Adult}},
  number       = {{16}},
  pages        = {{5309--5321}},
  title        = {{{Improving the efficacy and reliability of rTMS language mapping by increasing the stimulation frequency}}},
  doi          = {{10.1002/hbm.25619}},
  volume       = {{42}},
  year         = {{2021}},
}

% Heitkaemper et al., ICASSP 2020 — "Demystifying TasNet: A Dissecting Approach".
% NOTE(review): the keywords below are byte-identical to those of entry 20505 (a speech
% activity detection paper) and do not match this paper's topic (time-domain speech
% separation / TasNet) — presumably a copy-paste export error; confirm intended keywords.
% NOTE(review): no doi recorded in source — verify whether one exists for this paper.
@inproceedings{20504,
  abstract     = {{In recent years time domain speech separation has excelled over frequency domain separation in single channel scenarios and noise-free environments. In this paper we dissect the gains of the time-domain audio separation network (TasNet) approach by gradually replacing components of an utterance-level permutation invariant training (u-PIT) based separation system in the frequency domain until the TasNet system is reached, thus blending components of frequency domain approaches with those of time domain approaches. Some of the intermediate variants achieve comparable signal-to-distortion ratio (SDR) gains to TasNet, but retain the advantage of frequency domain processing: compatibility with classic signal processing tools such as frequency-domain beamforming and the human interpretability of the masks. Furthermore, we show that the scale invariant signal-to-distortion ratio (si-SDR) criterion used as loss function in TasNet is related to a logarithmic mean square error criterion and that it is this criterion which contributes most reliable to the performance advantage of TasNet. Finally, we critically assess which gains in a noise-free single channel environment generalize to more realistic reverberant conditions.}},
  author       = {{Heitkaemper, Jens and Jakobeit, Darius and Boeddeker, Christoph and Drude, Lukas and Haeb-Umbach, Reinhold}},
  booktitle    = {{ICASSP 2020 Virtual Barcelona Spain}},
  keywords     = {{voice activity detection, speech activity detection, neural network, statistical speech processing}},
  title        = {{{Demystifying TasNet: A Dissecting Approach}}},
  year         = {{2020}},
}

@article{33389,
  abstract     = {{Purpose: Space flight and bed rest (BR) lead to a rapid decline in exercise capacity. Whey protein plus potassium bicarbonate diet-supplementation (NUTR) could attenuate this effect by improving oxidative metabolism. We evaluated the impact of 21-day BR and NUTR on fatigue resistance of plantar flexor muscles (PF) during repeated shortening contractions, and whether any change was related to altered energy metabolism and muscle oxygenation.
Methods: Ten healthy men received a standardized isocaloric diet with (n = 5) or without (n = 5) NUTR. Eight bouts of 24 concentric plantar flexions (30 s each bout) with 20 s rest between bouts were employed. PF muscle size was assessed by means of peripheral quantitative computed tomography. PF muscle volume was assessed with magnetic resonance imaging. PF muscle force, contraction velocity, power and surface electromyogram signals were recorded during each contraction, as well as energy metabolism ($^{31}$P nuclear magnetic resonance spectroscopy) and oxygenation (near-infrared spectroscopy). Cardiopulmonary parameters were measured during an incremental cycle exercise test.
Results: BR caused 10--15{%} loss of PF volume that was partly recovered 3 days after re-ambulation, as a consequence of fluid redistribution. Unexpectedly, PF fatigue resistance was not affected by BR or NUTR. BR induced a shift in muscle metabolism toward glycolysis and some signs of impaired muscle oxygen extraction. NUTR did not attenuate the BR-induced-shift in energy metabolism.
Conclusions: Twenty-one days’ BR did not impair PF fatigue resistance, but the shift to glycolytic metabolism and indications of impaired oxygen extraction may be early signs of developing reduced muscle fatigue resistance.}},
  author       = {{Bosutti, Alessandra and Mulder, Edwin and Zange, Jochen and Bühlmeier, Judith and Ganse, Bergita and Degens, Hans}},
  issn         = {{1439-6319}},
  journal      = {{European Journal of Applied Physiology}},
  keywords     = {{Physiology (medical), Public Health, Environmental and Occupational Health, Orthopedics and Sports Medicine, General Medicine, Physiology}},
  number       = {{5}},
  pages        = {{969--983}},
  publisher    = {{Springer Science and Business Media LLC}},
  title        = {{{Effects of 21 days of bed rest and whey protein supplementation on plantar flexor muscle fatigue resistance during repeated shortening contractions}}},
  doi          = {{10.1007/s00421-020-04333-5}},
  volume       = {{120}},
  year         = {{2020}},
}

@inproceedings{20505,
  abstract     = {{Speech activity detection (SAD), which often rests on the fact that the noise is ``more'' stationary than speech, is particularly challenging in non-stationary environments, because the time variance of the acoustic scene makes it difficult to discriminate  speech from noise. We propose two approaches to SAD, where one is based on statistical signal processing, while the other utilizes neural networks. The former employs sophisticated signal processing to track the noise and speech energies and is meant to support the case for a resource efficient, unsupervised signal processing approach.
The latter introduces a recurrent network layer that operates on short segments of the input speech to do temporal smoothing in the presence of non-stationary noise. The systems are tested on the Fearless Steps challenge database, which consists of the transmission data from the Apollo-11 space mission.
The statistical SAD  achieves comparable detection performance to earlier proposed neural network based SADs, while the neural network based approach leads to a decision cost function of 1.07{%} on the evaluation set of the 2020 Fearless Steps Challenge, which sets a new state of the art.}},
  author       = {{Heitkaemper, Jens and Schmalenstroeer, Joerg and Haeb-Umbach, Reinhold}},
  booktitle    = {{INTERSPEECH 2020 Virtual Shanghai China}},
  keywords     = {{voice activity detection, speech activity detection, neural network, statistical speech processing}},
  title        = {{{Statistical and Neural Network Based Speech Activity Detection in Non-Stationary Acoustic Environments}}},
  year         = {{2020}},
}

% NOTE(review): this entry is an exact duplicate of entry 17179 below (same authors,
% title, booktitle, keywords, and year under a different key). Both keys are kept here
% in case either is cited elsewhere; consider merging and keeping a single key.
@inproceedings{17557,
  abstract     = {{Previous work by [1] studied gesture-speech interaction in adults. [1] focussed on temporal and semantic coordination of gesture and speech and found that while adult speech is mostly coordinated (or redundant) with gestures, semantic coordination increases the temporal synchrony. These observations do not necessarily hold for children (in particular with respect to iconic gestures, see [2]), where the speech and gesture systems are still under development. We studied the semantic and temporal coordination of speech and gesture in 4-year old children using a corpus of 40 children producing action descriptions in task oriented dialogues. In particular, we examined what kinds of information are transmitted verbally vs. non-verbally and how they are related. To account for this, we extended the semantic features (SFs) developed in [3] for object descriptions in order to include the semantics of actions. We coded the SFs on the children’s speech and gestures separately using video data. In our presentation, we will focus on the quantitative distribution of SFs across gesture and speech. Our results indicate that speech and gestures of 4-year olds are less integrated than those of the adults, although there is a large variability among the children. We will discuss the results with respect to the cognitive processes (e.g., visual memory, language) underlying children’s abilities at this stage of development. Our work paves the way for the cognitive architecture of speech-gesture interaction in preschoolers which to our knowledge is missing so far. }},
  author       = {{Abramov, Olga and Kopp, Stefan and Nemeth, Anne and Kern, Friederike and Mertens, Ulrich and Rohlfing, Katharina}},
  booktitle    = {{KOGWIS2018: Computational Approaches to Cognitive Science}},
  keywords     = {{Speech-gesture integration, semantic features}},
  title        = {{{Towards a Computational Model of Child Gesture-Speech Production}}},
  year         = {{2018}},
}

% NOTE(review): this entry is an exact duplicate of entry 17557 above (same authors,
% title, booktitle, keywords, and year under a different key). Both keys are kept here
% in case either is cited elsewhere; consider merging and keeping a single key.
@inproceedings{17179,
  abstract     = {{Previous work by [1] studied gesture-speech interaction in adults. [1] focussed on temporal and semantic coordination of gesture and speech and found that while adult speech is mostly coordinated (or redundant) with gestures, semantic coordination increases the temporal synchrony. These observations do not necessarily hold for children (in particular with respect to iconic gestures, see [2]), where the speech and gesture systems are still under development. We studied the semantic and temporal coordination of speech and gesture in 4-year old children using a corpus of 40 children producing action descriptions in task oriented dialogues. In particular, we examined what kinds of information are transmitted verbally vs. non-verbally and how they are related. To account for this, we extended the semantic features (SFs) developed in [3] for object descriptions in order to include the semantics of actions. We coded the SFs on the children’s speech and gestures separately using video data. In our presentation, we will focus on the quantitative distribution of SFs across gesture and speech. Our results indicate that speech and gestures of 4-year olds are less integrated than those of the adults, although there is a large variability among the children. We will discuss the results with respect to the cognitive processes (e.g., visual memory, language) underlying children’s abilities at this stage of development. Our work paves the way for the cognitive architecture of speech-gesture interaction in preschoolers which to our knowledge is missing so far. }},
  author       = {{Abramov, Olga and Kopp, Stefan and Nemeth, Anne and Kern, Friederike and Mertens, Ulrich and Rohlfing, Katharina}},
  booktitle    = {{KOGWIS2018: Computational Approaches to Cognitive Science}},
  keywords     = {{Speech-gesture integration, semantic features}},
  title        = {{{Towards a Computational Model of Child Gesture-Speech Production}}},
  year         = {{2018}},
}

% Gölz et al. 2018, Frontiers in Physiology 9 (DOI 10.3389/fphys.2018.01540).
% Entry checked: no abstract/pages in source (Frontiers uses article numbers); fields consistent.
@article{30116,
  author       = {{Gölz, Christian Johannes and Voelcker-Rehage, Claudia and Mora, Karin and Reuter, Eva-Maria and Godde, Ben and Dellnitz, Michael and Reinsberger, Claus and Vieluf, Solveig}},
  issn         = {{1664-042X}},
  journal      = {{Frontiers in Physiology}},
  keywords     = {{Physiology (medical), Physiology}},
  publisher    = {{Frontiers Media SA}},
  title        = {{{Improved Neural Control of Movements Manifests in Expertise-Related Differences in Force Output and Brain Network Dynamics}}},
  doi          = {{10.3389/fphys.2018.01540}},
  volume       = {{9}},
  year         = {{2018}},
}

% Scheer et al. 2018, Frontiers in Physiology 9 (DOI 10.3389/fphys.2018.01809).
% Entry checked: no abstract/pages in source (Frontiers uses article numbers); fields consistent.
@article{41968,
  author       = {{Scheer, Volker and Vieluf, Solveig and Cramer, Leoni and Jakobsmeyer, Rasmus and Heitkamp, Hans-Christian}},
  issn         = {{1664-042X}},
  journal      = {{Frontiers in Physiology}},
  keywords     = {{Physiology (medical), Physiology}},
  publisher    = {{Frontiers Media SA}},
  title        = {{{Changes in Running Economy During a 65-km Ultramarathon}}},
  doi          = {{10.3389/fphys.2018.01809}},
  volume       = {{9}},
  year         = {{2018}},
}

@article{33394,
  abstract     = {{The effectiveness of whey protein plus potassium bicarbonate-enriched diet (WP+KHCO$_3$) in mitigating disuse-induced changes in muscle fiber oxidative capacity and capillarization was investigated in a 21-day crossover design bed rest study. Ten healthy men (31 $\pm$ 6 yr) once received WP+KHCO$_3$ and once received a standardized isocaloric diet. Muscle biopsies were taken 2 days before and during the 19th day of bed rest (BR) from the soleus (SOL) and vastus lateralis (VL) muscle. Whole-body aerobic power (V̇o$_{2max}$), muscle fatigue, and isometric strength of knee extensor and plantar flexor muscles were monitored. Muscle fiber types and capillaries were identified by immunohistochemistry. Fiber oxidative capacity was determined as the optical density (OD) at 660 nm of succinate dehydrogenase (SDH)-stained sections. The product of fiber cross-sectional area and SDH-OD (integrated SDH) indicated the maximal oxygen consumption of that fiber. The maximal oxygen consumption supported by a capillary was calculated as the integrated SDH in its supply area. BR reduced isometric strength of knee extensor muscles (P {\textless} 0.05), and the fiber oxidative capacity (P {\textless} 0.001) and V̇o$_{2max}$ (P = 0.042), but had no significant impact on muscle capillarization or fatigue resistance of thigh muscles. The maximal oxygen consumption supported by a capillary was reduced by 24{%} in SOL and 16{%} in VL (P {\textless} 0.001). WP+KHCO$_3$ attenuated the disuse-induced reduction in fiber oxidative capacity in both muscles (P {\textless} 0.01). In conclusion, following 19 days of bed rest, the decrement in fiber oxidative capacity is proportionally larger than the loss of capillaries. WP+KHCO$_3$ appears to attenuate disuse-induced reductions in fiber oxidative capacity.}},
  author       = {{Bosutti, Alessandra and Salanova, Michele and Blottner, Dieter and Bühlmeier, Judith and Mulder, Edwin and Rittweger, Jörn and Yap, Moi Hoon and Ganse, Bergita and Degens, Hans}},
  issn         = {{8750-7587}},
  journal      = {{Journal of Applied Physiology}},
  keywords     = {{Physiology (medical), Physiology}},
  number       = {{4}},
  pages        = {{838--848}},
  publisher    = {{American Physiological Society}},
  title        = {{{Whey protein with potassium bicarbonate supplement attenuates the reduction in muscle oxidative capacity during 19 days of bed rest}}},
  doi          = {{10.1152/japplphysiol.00936.2015}},
  volume       = {{121}},
  year         = {{2016}},
}

% Chinaev & Haeb-Umbach, Interspeech 2015, pp. 1785-1789 — Minimum Statistics noise tracking.
% NOTE(review): no doi recorded in source — verify whether one exists for this paper.
@inproceedings{11739,
  abstract     = {{Noise tracking is an important component of speech enhancement algorithms. Of the many noise trackers proposed, Minimum Statistics (MS) is a particularly popular one due to its simple parameterization and at the same time excellent performance. In this paper we propose to further reduce the number of MS parameters by giving an alternative derivation of an optimal smoothing constant. At the same time the noise tracking performance is improved as is demonstrated by experiments employing speech degraded by various noise types and at different SNR values.}},
  author       = {{Chinaev, Aleksej and Haeb-Umbach, Reinhold}},
  booktitle    = {{Interspeech 2015}},
  keywords     = {{speech enhancement, noise tracking, optimal smoothing}},
  pages        = {{1785--1789}},
  title        = {{{On Optimal Smoothing in Minimum Statistics Based Noise Tracking}}},
  year         = {{2015}},
}

@inproceedings{11813,
  abstract     = {{The parametric Bayesian Feature Enhancement (BFE) and a data-driven Denoising Autoencoder (DA) both bring performance gains in severe single-channel speech recognition conditions. The first can be adjusted to different conditions by an appropriate parameter setting, while the latter needs to be trained on conditions similar to the ones expected at decoding time, making it vulnerable to a mismatch between training and test conditions. We use a DNN backend and study reverberant ASR under three types of mismatch conditions: different room reverberation times, different speaker to microphone distances and the difference between artificially reverberated data and the recordings in a reverberant environment. We show that for these mismatch conditions BFE can provide the targets for a DA. This unsupervised adaptation provides a performance gain over the direct use of BFE and even enables to compensate for the mismatch of real and simulated reverberant data.}},
  author       = {{Heymann, Jahn and Haeb-Umbach, Reinhold and Golik, P. and Schlueter, R.}},
  booktitle    = {{Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on}},
  keywords     = {{codecs, signal denoising, speech recognition, Bayesian feature enhancement, denoising autoencoder, reverberant ASR, single-channel speech recognition, speaker to microphone distances, unsupervised adaptation, Adaptation models, Noise reduction, Reverberation, Speech, Speech recognition, Training, deep neuronal networks, denoising autoencoder, feature enhancement, robust speech recognition}},
  pages        = {{5053--5057}},
  title        = {{{Unsupervised adaptation of a denoising autoencoder by Bayesian Feature Enhancement for reverberant ASR under mismatch conditions}}},
  doi          = {{10.1109/ICASSP.2015.7178933}},
  year         = {{2015}},
}

% NOTE(review): source had the publisher name ("German Medical Science GMS Publishing
% House") fused into the title field after a colon; moved to a publisher field. The
% conference/booktitle for this abstract (DOI 10.3205/15dgnc394) is not recorded in
% source — confirm and add the meeting name.
@inproceedings{57964,
  author       = {{Pieczewski, Julia and Neuschmelting, Volker and Thiele, Kristina and Grefkes, Christian and Goldbrunner, Roland and Weiss Lucas, Carolin}},
  keywords     = {{610 Medical sciences, Medicine, reliability, speech mapping, TMS}},
  publisher    = {{German Medical Science GMS Publishing House}},
  title        = {{{Good retest reliability of the rate of speech errors evoked by 10 Hz navigated repetitive transcranial magnetic stimulation in healthy volunteers}}},
  doi          = {{10.3205/15dgnc394}},
  year         = {{2015}},
}

@inproceedings{11753,
  abstract     = {{This contribution describes a step-wise source counting algorithm to determine the number of speakers in an offline scenario. Each speaker is identified by a variational expectation maximization (VEM) algorithm for complex Watson mixture models and therefore directly yields beamforming vectors for a subsequent speech separation process. An observation selection criterion is proposed which improves the robustness of the source counting in noise. The algorithm is compared to an alternative VEM approach with Gaussian mixture models based on directions of arrival and shown to deliver improved source counting accuracy. The article concludes by extending the offline algorithm towards a low-latency online estimation of the number of active sources from the streaming input data.}},
  author       = {{Drude, Lukas and Chinaev, Aleksej and Tran Vu, Dang Hai and Haeb-Umbach, Reinhold}},
  booktitle    = {{14th International Workshop on Acoustic Signal Enhancement (IWAENC 2014)}},
  keywords     = {{Accuracy, Acoustics, Estimation, Mathematical model, Source separation, Speech, Vectors, Bayes methods, Blind source separation, Directional statistics, Number of speakers, Speaker diarization}},
  pages        = {{213--217}},
  title        = {{{Towards Online Source Counting in Speech Mixtures Applying a Variational EM for Complex Watson Mixture Models}}},
  year         = {{2014}},
}

@article{11861,
  abstract     = {{In this contribution we present a theoretical and experimental investigation into the effects of reverberation and noise on features in the logarithmic mel power spectral domain, an intermediate stage in the computation of the mel frequency cepstral coefficients, prevalent in automatic speech recognition (ASR). Gaining insight into the complex interaction between clean speech, noise, and noisy reverberant speech features is essential for any ASR system to be robust against noise and reverberation present in distant microphone input signals. The findings are gathered in a probabilistic formulation of an observation model which may be used in model-based feature compensation schemes. The proposed observation model extends previous models in three major directions: First, the contribution of additive background noise to the observation error is explicitly taken into account. Second, an energy compensation constant is introduced which ensures an unbiased estimate of the reverberant speech features, and, third, a recursive variant of the observation model is developed resulting in reduced computational complexity when used in model-based feature compensation. The experimental section is used to evaluate the accuracy of the model and to describe how its parameters can be determined from test data.}},
  author       = {{Leutnant, Volker and Krueger, Alexander and Haeb-Umbach, Reinhold}},
  issn         = {{2329-9290}},
  journal      = {{IEEE/ACM Transactions on Audio, Speech, and Language Processing}},
  keywords     = {{computational complexity, reverberation, speech recognition, automatic speech recognition, background noise, clean speech, energy compensation, logarithmic mel power spectral domain, mel frequency cepstral coefficients, microphone input signals, model-based feature compensation schemes, noisy reverberant speech automatic recognition, noisy reverberant speech features, Atmospheric modeling, Computational modeling, Noise, Noise measurement, Reverberation, Speech, Vectors, Model-based feature compensation, observation model for reverberant and noisy speech, recursive observation model, robust automatic speech recognition}},
  number       = {{1}},
  pages        = {{95--109}},
  title        = {{{A New Observation Model in the Logarithmic Mel Power Spectral Domain for the Automatic Recognition of Noisy Reverberant Speech}}},
  doi          = {{10.1109/TASLP.2013.2285480}},
  volume       = {{22}},
  year         = {{2014}},
}

@article{11867,
  abstract     = {{New waves of consumer-centric applications, such as voice search and voice interaction with mobile devices and home entertainment systems, increasingly require automatic speech recognition (ASR) to be robust to the full range of real-world noise and other acoustic distorting conditions. Despite its practical importance, however, the inherent links between and distinctions among the myriad of methods for noise-robust ASR have yet to be carefully studied in order to advance the field further. To this end, it is critical to establish a solid, consistent, and common mathematical foundation for noise-robust ASR, which is lacking at present. This article is intended to fill this gap and to provide a thorough overview of modern noise-robust techniques for ASR developed over the past 30 years. We emphasize methods that are proven to be successful and that are likely to sustain or expand their future applicability. We distill key insights from our comprehensive overview in this field and take a fresh look at a few old problems, which nevertheless are still highly relevant today. Specifically, we have analyzed and categorized a wide range of noise-robust techniques using five different criteria: 1) feature-domain vs. model-domain processing, 2) the use of prior knowledge about the acoustic environment distortion, 3) the use of explicit environment-distortion models, 4) deterministic vs. uncertainty processing, and 5) the use of acoustic models trained jointly with the same feature enhancement or model adaptation process used in the testing stage. With this taxonomy-oriented review, we equip the reader with the insight to choose among techniques and with the awareness of the performance-complexity tradeoffs. The pros and cons of using different noise-robust ASR techniques in practical application scenarios are provided as a guide to interested practitioners. The current challenges and future research directions in this field is also carefully analyzed.}},
  author       = {{Li, Jinyu and Deng, Li and Gong, Yifan and Haeb-Umbach, Reinhold}},
  issn         = {{2329-9290}},
  journal      = {{IEEE/ACM Transactions on Audio, Speech, and Language Processing}},
  keywords     = {{Speech recognition, compensation, distortion modeling, joint model training, noise, robustness, uncertainty processing}},
  number       = {{4}},
  pages        = {{745--777}},
  title        = {{{An Overview of Noise-Robust Automatic Speech Recognition}}},
  doi          = {{10.1109/TASLP.2014.2304637}},
  volume       = {{22}},
  year         = {{2014}},
}

