@article{piater2024selfattention,
  author    = {Piater, Tristan and Penzel, Niklas and Stein, Gideon and Denzler, Joachim},
  title     = {{Self-Attention} for Medical Imaging - On the need for evaluations beyond mere benchmarking},
  journal   = {Communications in Computer and Information Science},
  year      = {2025},
  publisher = {Springer International Publishing},
  abstract  = {A considerable amount of research has been dedicated to creating systems that aid medical professionals in labor-intensive early screening tasks, which, to this date, often leverage convolutional deep-learning architectures. Recently, several studies have explored the application of self-attention mechanisms in the field of computer vision. These studies frequently demonstrate empirical improvements over traditional, fully convolutional approaches across a range of datasets and tasks. To assess this trend for medical imaging, we enhance two commonly used convolutional architectures with various self-attention mechanisms and evaluate them on two distinct medical datasets. We compare these enhanced architectures with similarly sized convolutional and attention-based baselines and rigorously assess performance gains through statistical evaluation. Furthermore, we investigate how the inclusion of self-attention influences the features learned by these models by assessing global and local explanations of model behavior. Contrary to our expectations, after performing an appropriate hyperparameter search, self-attention-enhanced architectures show no significant improvements in balanced accuracy compared to the evaluated baselines. Further, we find that relevant global features like dermoscopic structures in skin lesion images are not properly learned by any architecture. Finally, by assessing local explanations, we find that the inherent interpretability of self-attention mechanisms does not provide additional insights. Out-of-the-box model-agnostic approaches can provide explanations that are similar or even more faithful to the actual model behavior. We conclude that simply integrating attention mechanisms is unlikely to lead to a consistent increase in performance compared to fully convolutional methods in medical imaging applications.},
  note      = {(in press)},
}