<?xml version="1.0" encoding="UTF-8"?>

<modsCollection xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.loc.gov/mods/v3" xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-3.xsd">
<mods version="3.3">

<genre>conference paper</genre>

<titleInfo><title>End-to-End Training of Time Domain Audio Separation and Recognition</title></titleInfo>




<note type="qualityControlled">yes</note>

<name type="personal">
  <namePart type="given">Thilo</namePart>
  <namePart type="family">von Neumann</namePart>
  <role><roleTerm type="text">author</roleTerm> </role><identifier type="local">49870</identifier><description xsi:type="identifierDefinition" type="orcid">https://orcid.org/0000-0002-7717-8670</description></name>
<name type="personal">
  <namePart type="given">Keisuke</namePart>
  <namePart type="family">Kinoshita</namePart>
  <role><roleTerm type="text">author</roleTerm> </role></name>
<name type="personal">
  <namePart type="given">Lukas</namePart>
  <namePart type="family">Drude</namePart>
  <role><roleTerm type="text">author</roleTerm> </role></name>
<name type="personal">
  <namePart type="given">Christoph</namePart>
  <namePart type="family">Boeddeker</namePart>
  <role><roleTerm type="text">author</roleTerm> </role><identifier type="local">40767</identifier></name>
<name type="personal">
  <namePart type="given">Marc</namePart>
  <namePart type="family">Delcroix</namePart>
  <role><roleTerm type="text">author</roleTerm> </role></name>
<name type="personal">
  <namePart type="given">Tomohiro</namePart>
  <namePart type="family">Nakatani</namePart>
  <role><roleTerm type="text">author</roleTerm> </role></name>
<name type="personal">
  <namePart type="given">Reinhold</namePart>
  <namePart type="family">Haeb-Umbach</namePart>
  <role><roleTerm type="text">author</roleTerm> </role><identifier type="local">242</identifier></name>







<name type="corporate">
  <namePart></namePart>
  <identifier type="local">54</identifier>
  <role>
    <roleTerm type="text">department</roleTerm>
  </role>
</name>





<name type="corporate">
  <namePart>Computing Resources Provided by the Paderborn Center for Parallel Computing</namePart>
  <role><roleTerm type="text">project</roleTerm></role>
</name>



<abstract lang="eng">The rising interest in single-channel multi-speaker speech separation sparked development of End-to-End (E2E) approaches to multi-speaker speech recognition. However, up until now, state-of-the-art neural network–based time domain source separation has not yet been combined with E2E speech recognition. We here demonstrate how to combine a separation module based on a Convolutional Time domain Audio Separation Network (Conv-TasNet) with an E2E speech recognizer and how to train such a model jointly by distributing it over multiple GPUs or by approximating truncated back-propagation for the convolutional front-end. To put this work into perspective and illustrate the complexity of the design space, we provide a compact overview of single-channel multi-speaker recognition systems. Our experiments show a word error rate of 11.0% on WSJ0-2mix and indicate that our joint time domain model can yield substantial improvements over cascade DNN-HMM and monolithic E2E frequency domain systems proposed so far.</abstract>

<relatedItem type="constituent">
  <location>
    <url displayLabel="ICASSP_2020_vonNeumann_Paper.pdf">https://ris.uni-paderborn.de/download/20762/20763/ICASSP_2020_vonNeumann_Paper.pdf</url>
  </location>
  <physicalDescription><internetMediaType>application/pdf</internetMediaType></physicalDescription><accessCondition type="restrictionOnAccess">no</accessCondition>
</relatedItem>
<originInfo><dateIssued encoding="w3cdtf">2020</dateIssued>
</originInfo>
<language><languageTerm authority="iso639-2b" type="code">eng</languageTerm>
</language>



<relatedItem type="host"><titleInfo><title>ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</title></titleInfo><identifier type="doi">10.1109/ICASSP40776.2020.9053461</identifier>
<part><extent unit="pages">7004-7008</extent>
</part>
</relatedItem>


<extension>
<bibliographicCitation>
<ieee>T. von Neumann &lt;i&gt;et al.&lt;/i&gt;, “End-to-End Training of Time Domain Audio Separation and Recognition,” in &lt;i&gt;ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)&lt;/i&gt;, 2020, pp. 7004–7008, doi: &lt;a href=&quot;https://doi.org/10.1109/ICASSP40776.2020.9053461&quot;&gt;10.1109/ICASSP40776.2020.9053461&lt;/a&gt;.</ieee>
<chicago>Neumann, Thilo von, Keisuke Kinoshita, Lukas Drude, Christoph Boeddeker, Marc Delcroix, Tomohiro Nakatani, and Reinhold Haeb-Umbach. “End-to-End Training of Time Domain Audio Separation and Recognition.” In &lt;i&gt;ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)&lt;/i&gt;, 7004–8, 2020. &lt;a href=&quot;https://doi.org/10.1109/ICASSP40776.2020.9053461&quot;&gt;https://doi.org/10.1109/ICASSP40776.2020.9053461&lt;/a&gt;.</chicago>
<ama>von Neumann T, Kinoshita K, Drude L, et al. End-to-End Training of Time Domain Audio Separation and Recognition. In: &lt;i&gt;ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)&lt;/i&gt;. ; 2020:7004-7008. doi:&lt;a href=&quot;https://doi.org/10.1109/ICASSP40776.2020.9053461&quot;&gt;10.1109/ICASSP40776.2020.9053461&lt;/a&gt;</ama>
<mla>von Neumann, Thilo, et al. “End-to-End Training of Time Domain Audio Separation and Recognition.” &lt;i&gt;ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)&lt;/i&gt;, 2020, pp. 7004–08, doi:&lt;a href=&quot;https://doi.org/10.1109/ICASSP40776.2020.9053461&quot;&gt;10.1109/ICASSP40776.2020.9053461&lt;/a&gt;.</mla>
<bibtex>@inproceedings{vonNeumann_Kinoshita_Drude_Boeddeker_Delcroix_Nakatani_Haeb-Umbach_2020, title={End-to-End Training of Time Domain Audio Separation and Recognition}, DOI={&lt;a href=&quot;https://doi.org/10.1109/ICASSP40776.2020.9053461&quot;&gt;10.1109/ICASSP40776.2020.9053461&lt;/a&gt;}, booktitle={ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, author={von Neumann, Thilo and Kinoshita, Keisuke and Drude, Lukas and Boeddeker, Christoph and Delcroix, Marc and Nakatani, Tomohiro and Haeb-Umbach, Reinhold}, year={2020}, pages={7004–7008} }</bibtex>
<short>T. von Neumann, K. Kinoshita, L. Drude, C. Boeddeker, M. Delcroix, T. Nakatani, R. Haeb-Umbach, in: ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2020, pp. 7004–7008.</short>
<apa>von Neumann, T., Kinoshita, K., Drude, L., Boeddeker, C., Delcroix, M., Nakatani, T., &amp;#38; Haeb-Umbach, R. (2020). End-to-End Training of Time Domain Audio Separation and Recognition. &lt;i&gt;ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)&lt;/i&gt;, 7004–7008. &lt;a href=&quot;https://doi.org/10.1109/ICASSP40776.2020.9053461&quot;&gt;https://doi.org/10.1109/ICASSP40776.2020.9053461&lt;/a&gt;</apa>
</bibliographicCitation>
</extension>
<recordInfo><recordIdentifier>20762</recordIdentifier><recordCreationDate encoding="w3cdtf">2020-12-16T14:07:54Z</recordCreationDate><recordChangeDate encoding="w3cdtf">2023-11-15T12:17:45Z</recordChangeDate>
</recordInfo>
</mods>
</modsCollection>
