@inproceedings{61234,
  abstract     = {{The ability to generate explanations that are understood by explainees is the
quintessence of explainable artificial intelligence. Since understanding
depends on the explainee's background and needs, recent research focused on
co-constructive explanation dialogues, where an explainer continuously monitors
the explainee's understanding and adapts their explanations dynamically. We
investigate the ability of large language models (LLMs) to engage as explainers
in co-constructive explanation dialogues. In particular, we present a user
study in which explainees interact with an LLM in two settings, one of which
involves the LLM being instructed to explain a topic co-constructively. We
evaluate the explainees' understanding before and after the dialogue, as well
as their perception of the LLMs' co-constructive behavior. Our results suggest
that LLMs show some co-constructive behaviors, such as asking verification
questions, that foster the explainees' engagement and can improve understanding
of a topic. However, their ability to effectively monitor the current
understanding and scaffold the explanations accordingly remains limited.}},
  author       = {{Fichtel, Leandra and Spliethöver, Maximilian and Hüllermeier, Eyke and Jimenez, Patricia and Klowait, Nils and Kopp, Stefan and Ngonga Ngomo, Axel-Cyrille and Robrecht, Amelie and Scharlau, Ingrid and Terfloth, Lutz and Vollmer, Anna-Lisa and Wachsmuth, Henning}},
  booktitle    = {{Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue}},
  publisher    = {{Association for Computational Linguistics}},
  title        = {{{Investigating Co-Constructive Behavior of Large Language Models in Explanation Dialogues}}},
  year         = {{2025}},
}

@inproceedings{59856,
  abstract     = {{Recent advances on instruction fine-tuning have led to the development of various prompting techniques for large language models, such as explicit reasoning steps. However, the success of techniques depends on various parameters, such as the task, language model, and context provided. Finding an effective prompt is, therefore, often a trial-and-error process. Most existing approaches to automatic prompting aim to optimize individual techniques instead of compositions of techniques and their dependence on the input. To fill this gap, we propose an adaptive prompting approach that predicts the optimal prompt composition ad-hoc for a given input. We apply our approach to social bias detection, a highly context-dependent task that requires semantic understanding. We evaluate it with three large language models on three datasets, comparing compositions to individual techniques and other baselines. The results underline the importance of finding an effective prompt composition. Our approach robustly ensures high detection performance, and is best in several settings. Moreover, first experiments on other tasks support its generalizability.}},
  author       = {{Spliethöver, Maximilian and Knebler, Tim and Fumagalli, Fabian and Muschalik, Maximilian and Hammer, Barbara and Hüllermeier, Eyke and Wachsmuth, Henning}},
  booktitle    = {{Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)}},
  editor       = {{Chiruzzo, Luis and Ritter, Alan and Wang, Lu}},
  isbn         = {{979-8-89176-189-6}},
  pages        = {{2421--2449}},
  publisher    = {{Association for Computational Linguistics}},
  title        = {{{Adaptive Prompting: Ad-hoc Prompt Composition for Social Bias Detection}}},
  year         = {{2025}},
}

@inproceedings{58722,
  abstract     = {{Dialects introduce syntactic and lexical variations in language that occur in regional or social groups. Most NLP methods are not sensitive to such variations. This may lead to unfair behavior of the methods, conveying negative bias towards dialect speakers. While previous work has studied dialect-related fairness for aspects like hate speech, other aspects of biased language, such as lewdness, remain fully unexplored. To fill this gap, we investigate performance disparities between dialects in the detection of five aspects of biased language and how to mitigate them. To alleviate bias, we present a multitask learning approach that models dialect language as an auxiliary task to incorporate syntactic and lexical variations. In our experiments with African-American English dialect, we provide empirical evidence that complementing common learning approaches with dialect modeling improves their fairness. Furthermore, the results suggest that multitask learning achieves state-of-the-art performance and helps to detect properties of biased language more reliably.}},
  author       = {{Spliethöver, Maximilian and Menon, Sai Nikhil and Wachsmuth, Henning}},
  booktitle    = {{Findings of the Association for Computational Linguistics: ACL 2024}},
  editor       = {{Ku, Lun-Wei and Martins, Andre and Srikumar, Vivek}},
  pages        = {{9294--9313}},
  publisher    = {{Association for Computational Linguistics}},
  title        = {{{Disentangling Dialect from Social Bias via Multitask Learning to Improve Fairness}}},
  doi          = {{10.18653/v1/2024.findings-acl.553}},
  year         = {{2024}},
}

@inproceedings{34082,
  abstract     = {{Gender bias may emerge from an unequal representation of agency and power, for example, by portraying women frequently as passive and powerless (``She accepted her future'') and men as proactive and powerful (``He chose his future''). When language models learn from respective texts, they may reproduce or even amplify the bias. An effective way to mitigate bias is to generate counterfactual sentences with opposite agency and power to the training. Recent work targeted agency-specific verbs from a lexicon to this end. We argue that this is insufficient, due to the interaction of agency and power and their dependence on context. In this paper, we thus develop a new rewriting model that identifies verbs with the desired agency and power in the context of the given sentence. The verbs' probability is then boosted to encourage the model to rewrite both connotations jointly. According to automatic metrics, our model effectively controls for power while being competitive in agency to the state of the art. In our evaluation, human annotators favored its counterfactuals in terms of both connotations, also deeming its meaning preservation better.}},
  author       = {{Stahl, Maja and Spliethöver, Maximilian and Wachsmuth, Henning}},
  booktitle    = {{Proceedings of the Fifth Workshop on Natural Language Processing and Computational Social Science}},
  location     = {{Abu Dhabi, United Arab Emirates}},
  title        = {{{To Prefer or to Choose? Generating Agency and Power Counterfactuals Jointly for Gender Bias Mitigation}}},
  year         = {{2022}},
}

@inproceedings{22156,
  abstract     = {{Word embedding models reflect bias towards genders, ethnicities, and other social groups present in the underlying training data. Metrics such as ECT, RNSB, and WEAT quantify bias in these models based on predefined word lists representing social groups and bias-conveying concepts. How suitable these lists actually are to reveal bias - let alone the bias metrics in general - remains unclear, though. In this paper, we study how to assess the quality of bias metrics for word embedding models. In particular, we present a generic method, Bias Silhouette Analysis (BSA), that quantifies the accuracy and robustness of such a metric and of the word lists used. Given a biased and an unbiased reference embedding model, BSA applies the metric systematically for several subsets of the lists to the models. The variance and rate of convergence of the bias values of each model then entail the robustness of the word lists, whereas the distance between the models' values gives indications of the general accuracy of the metric with the word lists. We demonstrate the behavior of BSA on two standard embedding models for the three mentioned metrics with several word lists from existing research.}},
  author       = {{Spliethöver, Maximilian and Wachsmuth, Henning}},
  booktitle    = {{Proceedings of the Thirtieth International Joint Conference on Artificial Intelligence, IJCAI-21}},
  location     = {{Online}},
  pages        = {{552--559}},
  title        = {{{Bias Silhouette Analysis: Towards Assessing the Quality of Bias Metrics for Word Embedding Models}}},
  doi          = {{10.24963/ijcai.2021/77}},
  year         = {{2021}},
}

@inproceedings{25297,
  author       = {{Alshomary, Milad and Gurcke, Timon and Syed, Shahbaz and Heinisch, Philipp and Spliethöver, Maximilian and Cimiano, Philipp and Potthast, Martin and Wachsmuth, Henning}},
  booktitle    = {{Proceedings of the 8th Workshop on Argument Mining}},
  pages        = {{184--189}},
  title        = {{{Key Point Analysis via Contrastive Learning and Extractive Argument Summarization}}},
  year         = {{2021}},
}

@inproceedings{20139,
  author       = {{Spliethöver, Maximilian and Wachsmuth, Henning}},
  booktitle    = {{Proceedings of the 7th Workshop on Argument Mining (ArgMining 2020)}},
  pages        = {{76--87}},
  title        = {{{Argument from Old Man's View: Assessing Social Bias in Argumentation}}},
  year         = {{2020}},
}

@inproceedings{21174,
  abstract     = {{Overcoming a range of challenges that traditional therapy faces, virtual reality exposure therapy (VRET) yields great potential for the treatment of phobias such as acrophobia, the fear of heights. We investigate this potential and present playful user-generated treatment (PUT), a novel game-based approach for VRET. Based on a requirement analysis consisting of a literature review and semi-structured interviews with professional therapists, we designed and implemented the PUT concept as a two-step VR game design. To validate our approach, we conducted two studies. (1) In a study with 31 non-acrophobic subjects, we investigated the effect of content creation on player experience, motivation and height perception, and (2) in an online survey, we collected feedback from professional therapists. Both studies reveal that the PUT approach is well applicable. In particular, the analysis of the user study shows that the design phase leads to increased interest and enjoyment without notably influencing affective measures during the exposure session. Our work can help guide researchers and practitioners at the intersection of game design and exposure therapy.}},
  author       = {{Alexandrovsky, Dmitry and Volkmar, Georg and Spliethöver, Maximilian and Finke, Stefan and Herrlich, Marc and Döring, Tanja and Smeddinck, Jan David and Malaka, Rainer}},
  booktitle    = {{Proceedings of the Annual Symposium on Computer-Human Interaction in Play}},
  isbn         = {{9781450380744}},
  pages        = {{32--45}},
  publisher    = {{Association for Computing Machinery}},
  title        = {{{Playful User-Generated Treatment: A Novel Game Design Approach for VR Exposure Therapy}}},
  doi          = {{10.1145/3410404.3414222}},
  year         = {{2020}},
}

@inproceedings{21177,
  abstract     = {{Attention mechanisms have seen some success for natural language processing downstream tasks in recent years and generated new state-of-the-art results. A thorough evaluation of the attention mechanism for the task of Argumentation Mining is missing. With this paper, we report a comparative evaluation of attention layers in combination with a bidirectional long short-term memory network, which is the current state-of-the-art approach for the unit segmentation task. We also compare sentence-level contextualized word embeddings to pre-generated ones. Our findings suggest that for this task, the additional attention layer does not improve the performance. In most cases, contextualized embeddings also do not show an improvement over the score achieved by pre-defined embeddings.}},
  author       = {{Spliethöver, Maximilian and Klaff, Jonas and Heuer, Hendrik}},
  booktitle    = {{Proceedings of the 6th Workshop on Argument Mining}},
  location     = {{Florence, Italy}},
  pages        = {{74--82}},
  publisher    = {{Association for Computational Linguistics}},
  title        = {{{Is It Worth the Attention? A Comparative Evaluation of Attention Layers for Argument Unit Segmentation}}},
  doi          = {{10.18653/v1/W19-4509}},
  year         = {{2019}},
}

@techreport{16847,
  abstract     = {{In this work we describe our results achieved in the ProtestNews Lab at CLEF 2019. To tackle the problems of event sentence detection and event extraction we decided to use contextualized string embeddings. The models were trained on a data corpus collected from Indian news sources, but evaluated on data obtained from news sources from other countries as well, such as China. Our models have obtained competitive results and have scored 3rd in the event sentence detection task and 1st in the event extraction task based on average F1-scores for different test datasets.}},
  author       = {{Skitalinskaya, Gabriella and Klaff, Jonas and Spliethöver, Maximilian}},
  pages        = {{7}},
  title        = {{{CLEF ProtestNews Lab 2019: Contextualized Word Embeddings for Event Sentence Detection and Event Extraction}}},
  volume       = {{2380}},
  year         = {{2019}},
}

@inproceedings{21173,
  author       = {{Bonfert, Michael and Spliethöver, Maximilian and Arzaroli, Roman and Lange, Marvin and Hanci, Martin and Porzel, Robert}},
  booktitle    = {{Proceedings of the 20th ACM International Conference on Multimodal Interaction}},
  isbn         = {{9781450356923}},
  title        = {{{If You Ask Nicely: A Digital Assistant Rebuking Impolite Voice Commands}}},
  doi          = {{10.1145/3242969.3242995}},
  year         = {{2018}},
}
