@inproceedings{piater2025prompt,
  author    = {Tristan Piater and Björn Barz and Alexander Freytag},
  title     = {Prompt-Tuning SAM: From Generalist to Specialist with Only 2,048 Parameters and 16 Training Images},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPR-WS)},
  year      = {2025},
  month     = {June},
  pages     = {4688-4698},
  abstract  = {The Segment Anything Model (SAM) is widely used for segmenting a diverse range of objects in natural images from simple user prompts like points or bounding boxes. However, SAM's performance decreases substantially when applied to non-natural domains like microscopic imaging. Furthermore, due to SAM's interactive design, it requires a precise prompt for each image and object, which is infeasible in many automated biomedical applications. Previous solutions adapt SAM by training millions of parameters via fine-tuning of large parts of the model or of adapter layers. In contrast, we show that as few as 2,048 additional parameters are sufficient for turning SAM into a use-case specialist for a given downstream task. Our novel PTSAM (prompt-tuned SAM) method uses prompt-tuning, a parameter-efficient fine-tuning technique, to adapt SAM to a specific task. We validate the performance of our approach on multiple microscopy datasets and one medical dataset. Our results show that prompt-tuning only SAM's mask decoder already yields performance on par with state-of-the-art techniques while requiring roughly 2,000× fewer trainable parameters. To address domain gaps, we find that additionally prompt-tuning SAM's image encoder is beneficial, further improving segmentation accuracy by up to 18% over state-of-the-art results. Since PTSAM can be reliably trained with as few as 16 annotated images, we find it particularly helpful for applications with limited training data and domain shifts.},
  doi       = {10.1109/CVPRW67362.2025.00455},
}