@inproceedings{62163,
  abstract     = {Zero-shot classifiers based on Contrastive Language-Audio Pretraining (CLAP) models enable classification of given audio into classes defined at test time using text. These models are costly to run with respect to computation and memory requirements. In this work, we propose to build a specialized low-resource classifier for classes pre-defined using text, using a two-stage procedure consisting of zero-shot data set pruning and model compression. First, relevant in-domain data is selected from a source dataset using class label embeddings obtained from a pre-trained CLAP model. This data is then used to distill the audio encoder of a CLAP model. The proposed compression method produces compact audio encoders with slightly reduced accuracy. Note that neither labeled nor unlabeled in-domain audio data is required for its development. We verify by cross-dataset tests that the resulting classifiers are indeed specialized to their task.},
  author       = {Werning, Alexander and H{\"a}b-Umbach, Reinhold},
  booktitle    = {Proceedings of the 16th {ITG} Conference on Speech Communication},
  editor       = {M{\"o}ller, Sebastian and Gerkmann, Timo and Kolossa, Dorothea},
  location     = {Berlin},
  pages        = {76--80},
  title        = {A Fully Zero-Shot Approach to Obtaining Specialized and Compact Audio Tagging Models},
  year         = {2025},
}

@inproceedings{59900,
  abstract     = {Running state-of-the-art large-scale audio models on edge devices is often infeasible due to their limited storage and computing resources. It is therefore necessary to compress and tune the models for the specific target task and hardware. This is commonly achieved by distilling the audio model, the teacher, to a small target model, the student. However, this approach can be improved by prepending a dataset pruning stage and training the teacher on the pruned data set only, which contains examples relevant to the target task. Recently, CLAP models have emerged that embed audio and text examples in a common embedding space. We use the audio embeddings of the CLAP model for the above pruning stage, which is realized using a domain classifier. After knowledge distillation, the student is eventually fine-tuned on some data from the target domain. The CLAP architecture combines text and audio embedding spaces, which allows to search for data given only a textual description, such as a class label. We show how this can help data pruning.},
  author       = {Werning, Alexander and H{\"a}b-Umbach, Reinhold},
  booktitle    = {Proceedings of {DAS|DAGA} 2025},
  location     = {Copenhagen},
  title        = {Distilling Efficient Audio Models using Data Pruning with {CLAP}},
  year         = {2025},
}

@techreport{57161,
  author       = {Werning, Alexander and H{\"a}b-Umbach, Reinhold},
  institution  = {Paderborn University},
  title        = {{UPB-NT} Submission to {DCASE24}: Dataset Pruning for Targeted Knowledge Distillation},
  type         = {Technical Report},
  year         = {2024},
}

@inproceedings{57160,
  abstract     = {Large audio tagging models are usually trained or pre-trained on AudioSet, a dataset that encompasses a large amount of different sound classes and acoustic environments. Knowledge distillation has emerged as a method to compress such models without compromising their effectiveness. There are many different applications for audio tagging, some of which require a specialization to a narrow domain of sounds to be classified. For these scenarios, it is beneficial to distill the large audio tagger with respect to a specific subset of sounds of interest. A method to prune a general dataset with respect to a target dataset is presented. By distilling with such a specialized pruned dataset, we obtain a compressed model with better classification accuracy in the specific target domain than with target-agnostic distillation.},
  author       = {Werning, Alexander and H{\"a}b-Umbach, Reinhold},
  booktitle    = {32nd European Signal Processing Conference ({EUSIPCO} 2024)},
  keywords     = {data pruning, knowledge distillation, audio tagging},
  location     = {Lyon},
  title        = {Target-Specific Dataset Pruning for Compression of Audio Tagging Models},
  year         = {2024},
}

