@inproceedings{penzel2023interpreting,
  title     = {Interpreting Art by Leveraging Pre-Trained Models},
  author    = {Niklas Penzel and Joachim Denzler},
  booktitle = {International Conference on Machine Vision and Applications (MVA)},
  year      = {2023},
  pages     = {1--6},
  doi       = {10.23919/MVA57639.2023.10216010},
  abstract  = {In many domains, so-called foundation models have recently been proposed. These models are trained on immense amounts of data, resulting in impressive performance on various downstream tasks and benchmarks. Later works focus on leveraging this pre-trained knowledge by combining these models. To reduce data and compute requirements, we utilize and combine foundation models in two ways. First, we use language and vision models to extract and generate a challenging language-vision task in the form of artwork interpretation pairs. Second, we combine and fine-tune CLIP as well as GPT-2 to reduce the compute requirements for training interpretation models. We perform a qualitative and quantitative analysis of our data and conclude that generating artwork leads to improvements in visual-text alignment and, therefore, to more proficient interpretation models. Our approach addresses how to leverage and combine pre-trained models to tackle tasks where existing data is scarce or difficult to obtain.},
  groups    = {aesthetics},
}