@article{65182,
  abstract     = {{The aggregation of rating metrics in reputation systems is crucial for mitigating information overload by condensing customer rating distributions into singular valence scores. While platforms typically employ technical aggregation functions, such as the arithmetic mean to capture product quality, it remains unclear whether these functions align with customers' innate aggregation patterns. To address this knowledge gap, we designed a controlled economic decision experiment to elicit customers' aggregation principles by analyzing their product ranking decisions and contrasting these with various reference functions. Our findings indicate that, on average, customers aggregate rating information in accordance with the arithmetic mean. However, a granular analysis at the individual level reveals significant heterogeneity in aggregation behavior, with a substantial cluster exhibiting binary patterns that focus equally on negative (1-2 star) and positive (4-5 star) ratings. Additional clusters concentrate on negative feedback, particularly 1-star ratings or 1-2 star ratings collectively. Notably, these inherent aggregation patterns exhibit stability across variations in numerical information presentation and are not significantly influenced by individual characteristics, such as online shopping experience, risk attitudes, or demographics. These findings suggest that while the arithmetic mean captures average consumer behavior, platforms could benefit from offering customizable aggregation options to better cater to diverse user preferences for processing rating distributions. By doing so, platforms can enhance the effectiveness of their reputation systems and improve the overall quality of decision-making for consumers.}},
  author       = {{van Straaten, Dirk and Mir Djawadi, Behnud and Melnikov, Vitalik and Hüllermeier, Eyke and Fahr, René}},
  journal      = {{SSRN Electronic Journal}},
  publisher    = {{Elsevier BV}},
  title        = {{{Aggregation Processes in Customer Rating Systems - Insights from an Economic Decision Experiment}}},
  doi          = {{10.2139/ssrn.6201258}},
  year         = {{2026}},
}

@inproceedings{61234,
  abstract     = {{The ability to generate explanations that are understood by explainees is the
quintessence of explainable artificial intelligence. Since understanding
depends on the explainee's background and needs, recent research focused on
co-constructive explanation dialogues, where an explainer continuously monitors
the explainee's understanding and adapts their explanations dynamically. We
investigate the ability of large language models (LLMs) to engage as explainers
in co-constructive explanation dialogues. In particular, we present a user
study in which explainees interact with an LLM in two settings, one of which
involves the LLM being instructed to explain a topic co-constructively. We
evaluate the explainees' understanding before and after the dialogue, as well
as their perception of the LLMs' co-constructive behavior. Our results suggest
that LLMs show some co-constructive behaviors, such as asking verification
questions, that foster the explainees' engagement and can improve understanding
of a topic. However, their ability to effectively monitor the current
understanding and scaffold the explanations accordingly remains limited.}},
  author       = {{Fichtel, Leandra and Spliethöver, Maximilian and Hüllermeier, Eyke and Jimenez, Patricia and Klowait, Nils and Kopp, Stefan and Ngonga Ngomo, Axel-Cyrille and Robrecht, Amelie and Scharlau, Ingrid and Terfloth, Lutz and Vollmer, Anna-Lisa and Wachsmuth, Henning}},
  booktitle    = {{Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue}},
  publisher    = {{Association for Computational Linguistics}},
  title        = {{{Investigating Co-Constructive Behavior of Large Language Models in Explanation Dialogues}}},
  year         = {{2025}},
}

@inproceedings{59856,
  abstract     = {{Recent advances on instruction fine-tuning have led to the development of various prompting techniques for large language models, such as explicit reasoning steps. However, the success of techniques depends on various parameters, such as the task, language model, and context provided. Finding an effective prompt is, therefore, often a trial-and-error process. Most existing approaches to automatic prompting aim to optimize individual techniques instead of compositions of techniques and their dependence on the input. To fill this gap, we propose an adaptive prompting approach that predicts the optimal prompt composition ad-hoc for a given input. We apply our approach to social bias detection, a highly context-dependent task that requires semantic understanding. We evaluate it with three large language models on three datasets, comparing compositions to individual techniques and other baselines. The results underline the importance of finding an effective prompt composition. Our approach robustly ensures high detection performance, and is best in several settings. Moreover, first experiments on other tasks support its generalizability.}},
  author       = {{Spliethöver, Maximilian and Knebler, Tim and Fumagalli, Fabian and Muschalik, Maximilian and Hammer, Barbara and Hüllermeier, Eyke and Wachsmuth, Henning}},
  booktitle    = {{Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)}},
  editor       = {{Chiruzzo, Luis and Ritter, Alan and Wang, Lu}},
  isbn         = {{979-8-89176-189-6}},
  pages        = {{2421--2449}},
  publisher    = {{Association for Computational Linguistics}},
  title        = {{{Adaptive Prompting: Ad-hoc Prompt Composition for Social Bias Detection}}},
  year         = {{2025}},
}

@inbook{61820,
  abstract     = {{A scoring list is a sequence of simple decision models, where features are incrementally evaluated and scores of satisfied features are summed to be used for threshold-based decisions or for calculating class probabilities. In this paper, we introduce a new multi-class variant and compare it against previously introduced binary classification variants for incremental decisions, as well as multi-class variants for classical decision-making using all features. Furthermore, we introduce a new multi-class dataset to assess collaborative human-machine decision-making, which is suitable for user studies with non-expert participants. We demonstrate the usefulness of our approach by evaluating predictive performance and compared to the performance of participants without AI help.}},
  author       = {{Heid, Stefan and Kornowicz, Jaroslaw and Hanselle, Jonas and Thommes, Kirsten and Hüllermeier, Eyke}},
  booktitle    = {{Communications in Computer and Information Science}},
  isbn         = {{9783032083265}},
  issn         = {{1865-0929}},
  publisher    = {{Springer Nature Switzerland}},
  title        = {{{MSL: Multi-class Scoring Lists for Interpretable Incremental Decision-Making}}},
  doi          = {{10.1007/978-3-032-08327-2_6}},
  year         = {{2025}},
}

@article{54911,
  author       = {{Heid, Stefan and Hanselle, Jonas Manuel and Fürnkranz, Johannes and Hüllermeier, Eyke}},
  issn         = {{0888-613X}},
  journal      = {{International Journal of Approximate Reasoning}},
  publisher    = {{Elsevier BV}},
  title        = {{{Learning decision catalogues for situated decision making: The case of scoring systems}}},
  doi          = {{10.1016/j.ijar.2024.109190}},
  volume       = {{171}},
  year         = {{2024}},
}

@article{54910,
  internal-note = {{exact duplicate of entry 54911; consider removing}},
  author       = {{Heid, Stefan and Hanselle, Jonas Manuel and Fürnkranz, Johannes and Hüllermeier, Eyke}},
  issn         = {{0888-613X}},
  journal      = {{International Journal of Approximate Reasoning}},
  publisher    = {{Elsevier BV}},
  title        = {{{Learning decision catalogues for situated decision making: The case of scoring systems}}},
  doi          = {{10.1016/j.ijar.2024.109190}},
  volume       = {{171}},
  year         = {{2024}},
}

@article{54907,
  internal-note = {{exact duplicate of entry 54911; consider removing}},
  author       = {{Heid, Stefan and Hanselle, Jonas Manuel and Fürnkranz, Johannes and Hüllermeier, Eyke}},
  issn         = {{0888-613X}},
  journal      = {{International Journal of Approximate Reasoning}},
  publisher    = {{Elsevier BV}},
  title        = {{{Learning decision catalogues for situated decision making: The case of scoring systems}}},
  doi          = {{10.1016/j.ijar.2024.109190}},
  volume       = {{171}},
  year         = {{2024}},
}

@inproceedings{57645,
  author       = {{Heid, Stefan and Kornowicz, Jaroslaw and Hanselle, Jonas Manuel and Hüllermeier, Eyke and Thommes, Kirsten}},
  booktitle    = {{PROCEEDINGS 34. WORKSHOP COMPUTATIONAL INTELLIGENCE}},
  pages        = {{233}},
  title        = {{{Human-AI Co-Construction of Interpretable Predictive Models: The Case of Scoring Systems}}},
  volume       = {{21}},
  year         = {{2024}},
}

@inproceedings{55631,
  abstract     = {{This paper investigates the remaining useful lifetime (RUL) estimation of bearings under dynamic, i.e., time-varying, operating conditions (OC). Unlike conventional studies that assume constant OC in bearing accelerated life tests, we introduce a dataset with time-varying OC during run-to-failure experiments, simulating real-world scenarios. We explore data-driven approaches to identify the transition point from a healthy to an unhealthy state and estimate the RUL. Additionally, we examine strategies for integrating OC information to enhance RUL estimations. These methodologies are evaluated through numerical experiments using various machine learning algorithms.}},
  author       = {{Javanmardi, Alireza and Aimiyekagbon, Osarenren Kennedy and Bender, Amelie and Kimotho, James Kuria and Sextro, Walter and Hüllermeier, Eyke}},
  booktitle    = {{PHM Society European Conference}},
  isbn         = {{978-1-936263-40-0}},
  location     = {{Prague, Czech Republic}},
  number       = {{1}},
  publisher    = {{PHM Society}},
  title        = {{{Remaining Useful Lifetime Estimation of Bearings Operating under Time-Varying Conditions}}},
  doi          = {{10.36001/phme.2024.v8i1.4101}},
  volume       = {{8}},
  year         = {{2024}},
}

@inproceedings{53073,
  abstract     = {{While shallow decision trees may be interpretable, larger ensemble models like gradient-boosted trees, which often set the state of the art in machine learning problems involving tabular data, still remain black box models. As a remedy, the Shapley value (SV) is a well-known concept in explainable artificial intelligence (XAI) research for quantifying additive feature attributions of predictions. The model-specific TreeSHAP methodology solves the exponential complexity for retrieving exact SVs from tree-based models. Expanding beyond individual feature attribution, Shapley interactions reveal the impact of intricate feature interactions of any order. In this work, we present TreeSHAP-IQ, an efficient method to compute any-order additive Shapley interactions for predictions of tree-based models. TreeSHAP-IQ is supported by a mathematical framework that exploits polynomial arithmetic to compute the interaction scores in a single recursive traversal of the tree, akin to Linear TreeSHAP. We apply TreeSHAP-IQ on state-of-the-art tree ensembles and explore interactions on well-established benchmark datasets.}},
  author       = {{Muschalik, Maximilian and Fumagalli, Fabian and Hammer, Barbara and Hüllermeier, Eyke}},
  booktitle    = {{Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)}},
  issn         = {{2374-3468}},
  keywords     = {{Explainable Artificial Intelligence}},
  number       = {{13}},
  pages        = {{14388--14396}},
  title        = {{{Beyond TreeSHAP: Efficient Computation of Any-Order Shapley Interactions for Tree Ensembles}}},
  doi          = {{10.1609/aaai.v38i13.29352}},
  volume       = {{38}},
  year         = {{2024}},
}

@inproceedings{55311,
  abstract     = {{Addressing the limitations of individual attribution scores via the Shapley value (SV), the field of explainable AI (XAI) has recently explored intricate interactions of features or data points. In particular, extensions of the SV, such as the Shapley Interaction Index (SII), have been proposed as a measure to still benefit from the axiomatic basis of the SV. However, similar to the SV, their exact computation remains computationally prohibitive. Hence, we propose with SVARM-IQ a sampling-based approach to efficiently approximate Shapley-based interaction indices of any order. SVARM-IQ can be applied to a broad class of interaction indices, including the SII, by leveraging a novel stratified representation. We provide non-asymptotic theoretical guarantees on its approximation quality and empirically demonstrate that SVARM-IQ achieves state-of-the-art estimation results in practical XAI scenarios on different model classes and application domains.}},
  author       = {{Kolpaczki, Patrick and Muschalik, Maximilian and Fumagalli, Fabian and Hammer, Barbara and Hüllermeier, Eyke}},
  booktitle    = {{Proceedings of The 27th International Conference on Artificial Intelligence and Statistics (AISTATS)}},
  pages        = {{3520--3528}},
  publisher    = {{PMLR}},
  title        = {{{SVARM-IQ: Efficient Approximation of Any-order Shapley Interactions through Stratification}}},
  volume       = {{238}},
  year         = {{2024}},
}

@inproceedings{61228,
  author       = {{Muschalik, Maximilian and Baniecki, Hubert and Fumagalli, Fabian and Kolpaczki, Patrick and Hammer, Barbara and Hüllermeier, Eyke}},
  booktitle    = {{Advances in Neural Information Processing Systems (NeurIPS)}},
  pages        = {{130324--130357}},
  title        = {{{shapiq: Shapley interactions for machine learning}}},
  volume       = {{37}},
  year         = {{2024}},
}

@inproceedings{51373,
  author       = {{Hanselle, Jonas Manuel and Fürnkranz, Johannes and Hüllermeier, Eyke}},
  booktitle    = {{26th International Conference on Discovery Science}},
  isbn         = {{9783031452741}},
  issn         = {{0302-9743}},
  location     = {{Porto}},
  pages        = {{189--203}},
  publisher    = {{Springer Nature Switzerland}},
  title        = {{{Probabilistic Scoring Lists for Interpretable Machine Learning}}},
  doi          = {{10.1007/978-3-031-45275-8_13}},
  volume       = {{14050}},
  year         = {{2023}},
}

@inbook{54613,
  author       = {{Hanselle, Jonas Manuel and Hüllermeier, Eyke and Mohr, Felix and Ngonga Ngomo, Axel-Cyrille and Sherif, Mohamed and Tornede, Alexander and Wever, Marcel Dominik}},
  booktitle    = {{On-The-Fly Computing -- Individualized IT-services in dynamic markets}},
  editor       = {{Haake, Claus-Jochen and Meyer auf der Heide, Friedhelm and Platzner, Marco and Wachsmuth, Henning and Wehrheim, Heike}},
  keywords     = {{dice ngonga sfb901 sherif}},
  pages        = {{85--104}},
  publisher    = {{Heinz Nixdorf Institut, Universität Paderborn}},
  title        = {{{Configuration and Evaluation}}},
  doi          = {{10.5281/zenodo.8068466}},
  volume       = {{412}},
  year         = {{2023}},
}

@inbook{54909,
  internal-note = {{duplicate of entry 51373 (same title and DOI, fewer fields); consider removing}},
  author       = {{Hanselle, Jonas Manuel and Fürnkranz, Johannes and Hüllermeier, Eyke}},
  booktitle    = {{Discovery Science}},
  isbn         = {{9783031452741}},
  issn         = {{0302-9743}},
  publisher    = {{Springer Nature Switzerland}},
  title        = {{{Probabilistic Scoring Lists for Interpretable Machine Learning}}},
  doi          = {{10.1007/978-3-031-45275-8_13}},
  year         = {{2023}},
}

@unpublished{44512,
  abstract     = {{For open world applications, deep neural networks (DNNs) need to be aware of
previously unseen data and adaptable to evolving environments. Furthermore, it
is desirable to detect and learn novel classes which are not included in the
DNNs underlying set of semantic classes in an unsupervised fashion. The method
proposed in this article builds upon anomaly detection to retrieve
out-of-distribution (OoD) data as candidates for new classes. We thereafter
extend the DNN by $k$ empty classes and fine-tune it on the OoD data samples.
To this end, we introduce two loss functions, which 1) entice the DNN to assign
OoD samples to the empty classes and 2) to minimize the inner-class feature
distances between them. Thus, instead of ground truth which contains labels for
the different novel classes, the DNN obtains a single OoD label together with a
distance matrix, which is computed in advance. We perform several experiments
for image classification and semantic segmentation, which demonstrate that a
DNN can extend its own semantic space by multiple classes without having access
to ground truth.}},
  author       = {{Uhlemeyer, Svenja and Lienen, Julian and Hüllermeier, Eyke and Gottschalk, Hanno}},
  note         = {{arXiv preprint arXiv:2305.00983}},
  eprint       = {{2305.00983}},
  archiveprefix = {{arXiv}},
  title        = {{{Detecting Novelties with Empty Classes}}},
  year         = {{2023}},
}

@inproceedings{31880,
  abstract     = {{The notion of neural collapse refers to several emergent phenomena that have been empirically observed across various canonical classification problems. During the terminal phase of training a deep neural network, the feature embedding of all examples of the same class tend to collapse to a single representation, and the features of different classes tend to separate as much as possible. Neural collapse is often studied through a simplified model, called the unconstrained feature representation, in which the model is assumed to have "infinite expressivity" and can map each data point to any arbitrary representation. In this work, we propose a more realistic variant of the unconstrained feature representation that takes the limited expressivity of the network into account. Empirical evidence suggests that the memorization of noisy data points leads to a degradation (dilation) of the neural collapse. Using a model of the memorization-dilation (M-D) phenomenon, we show one mechanism by which different losses lead to different performances of the trained network on noisy data. Our proofs reveal why label smoothing, a modification of cross-entropy empirically observed to produce a regularization effect, leads to improved generalization in classification tasks.}},
  author       = {{Nguyen, Duc Anh and Levie, Ron and Lienen, Julian and Kutyniok, Gitta and Hüllermeier, Eyke}},
  booktitle    = {{International Conference on Learning Representations, ICLR}},
  location     = {{Kigali, Rwanda}},
  title        = {{{Memorization-Dilation: Modeling Neural Collapse Under Noise}}},
  year         = {{2023}},
}

@inbook{45884,
  internal-note = {{duplicate of entry 54613 (same title, DOI, and pages); consider removing}},
  author       = {{Hanselle, Jonas Manuel and Hüllermeier, Eyke and Mohr, Felix and Ngonga Ngomo, Axel-Cyrille and Sherif, Mohamed and Tornede, Alexander and Wever, Marcel Dominik}},
  booktitle    = {{On-The-Fly Computing -- Individualized IT-services in dynamic markets}},
  editor       = {{Haake, Claus-Jochen and Meyer auf der Heide, Friedhelm and Platzner, Marco and Wachsmuth, Henning and Wehrheim, Heike}},
  pages        = {{85--104}},
  publisher    = {{Heinz Nixdorf Institut, Universität Paderborn}},
  title        = {{{Configuration and Evaluation}}},
  doi          = {{10.5281/zenodo.8068466}},
  volume       = {{412}},
  year         = {{2023}},
}

@inbook{45886,
  author       = {{Wehrheim, Heike and Hüllermeier, Eyke and Becker, Steffen and Becker, Matthias and Richter, Cedric and Sharma, Arnab}},
  booktitle    = {{On-The-Fly Computing -- Individualized IT-services in dynamic markets}},
  editor       = {{Haake, Claus-Jochen and Meyer auf der Heide, Friedhelm and Platzner, Marco and Wachsmuth, Henning and Wehrheim, Heike}},
  pages        = {{105--123}},
  publisher    = {{Heinz Nixdorf Institut, Universität Paderborn}},
  title        = {{{Composition Analysis in Unknown Contexts}}},
  doi          = {{10.5281/zenodo.8068510}},
  volume       = {{412}},
  year         = {{2023}},
}

@unpublished{45911,
  abstract     = {{Label noise poses an important challenge in machine learning, especially in
deep learning, in which large models with high expressive power dominate the
field. Models of that kind are prone to memorizing incorrect labels, thereby
harming generalization performance. Many methods have been proposed to address
this problem, including robust loss functions and more complex label correction
approaches. Robust loss functions are appealing due to their simplicity, but
typically lack flexibility, while label correction usually adds substantial
complexity to the training setup. In this paper, we suggest to address the
shortcomings of both methodologies by "ambiguating" the target information,
adding additional, complementary candidate labels in case the learner is not
sufficiently convinced of the observed training label. More precisely, we
leverage the framework of so-called superset learning to construct set-valued
targets based on a confidence threshold, which deliver imprecise yet more
reliable beliefs about the ground-truth, effectively helping the learner to
suppress the memorization effect. In an extensive empirical evaluation, our
method demonstrates favorable learning behavior on synthetic and real-world
noise, confirming the effectiveness in detecting and correcting erroneous
training labels.}},
  author       = {{Lienen, Julian and Hüllermeier, Eyke}},
  note         = {{arXiv preprint arXiv:2305.13764}},
  eprint       = {{2305.13764}},
  archiveprefix = {{arXiv}},
  title        = {{{Mitigating Label Noise through Data Ambiguation}}},
  year         = {{2023}},
}

