@inproceedings{ickler2025text,
  author    = {Ickler, Christian and Venkataramanan, Aishwarya and Denzler, Joachim},
  title     = {Text-Assisted Zero-Shot Classification of Fine-Grained Animal Species},
  booktitle = {International Workshop Series on Camera Traps, {AI}, \& Ecology ({CamTrapAI})},
  year      = {2025},
  abstract  = {Fine-grained visual classification of animals is vital for automatic ecological monitoring. Its inherent challenges, including low inter-class and high intra-class variations, are often compounded by data scarcity for rare species. These difficulties are particularly pronounced in zero-shot classification, where models must identify classes without training examples, necessitating auxiliary information like high-quality textual descriptions to accurately discriminate the species. However, text-assisted zero-shot fine-grained classification of animal species remains largely unexplored. In this work, we evaluate a set of CLIP-based methods by appending class descriptions to their text prompts. We then propose a two-stage framework that relies on a large vision language model (LVLM) to compare image features to descriptions. Our approach extends the two-stage framework of CascadeVLM, which uses CLIP in the first stage to select a set of candidate species and refines the predictions in the second stage with an LVLM, by also integrating descriptive texts into the LVLM's prompt, leveraging its in-context learning ability. We evaluate on two benchmark datasets: CUB-200-2011 and EU-Moths, containing fine-grained bird and moth images, respectively. Our results indicate that while producing competitive predictions, our approach still struggles when applying general-purpose foundation models in highly specialised animal domains. Unbalanced performance across stages and LVLM hallucinations highlight the need for more robust zero-shot classification approaches leveraging detailed text descriptions.},
}